cmd/compile: use generated loops instead of DUFFCOPY on riscv64
author     Meng Zhuo <mengzhuo@iscas.ac.cn>
           Wed, 3 Sep 2025 01:55:56 +0000 (09:55 +0800)
committer  Meng Zhuo <mengzhuo@iscas.ac.cn>
           Wed, 10 Sep 2025 02:42:09 +0000 (19:42 -0700)
                                    old B/s        new B/s        delta
MemmoveKnownSize112-4            632.1Mi ± 1%   1288.5Mi ± 0%  +103.85% (p=0.000 n=10)
MemmoveKnownSize128-4            636.1Mi ± 0%   1280.9Mi ± 1%  +101.36% (p=0.000 n=10)
MemmoveKnownSize192-4            645.3Mi ± 0%   1306.9Mi ± 1%  +102.53% (p=0.000 n=10)
MemmoveKnownSize248-4            650.2Mi ± 2%   1312.5Mi ± 1%  +101.87% (p=0.000 n=10)
MemmoveKnownSize256-4            650.7Mi ± 0%   1303.6Mi ± 1%  +100.33% (p=0.000 n=10)
MemmoveKnownSize512-4            658.2Mi ± 1%   1293.9Mi ± 0%   +96.60% (p=0.000 n=10)
MemmoveKnownSize1024-4           662.1Mi ± 0%   1312.6Mi ± 0%   +98.26% (p=0.000 n=10)

Change-Id: I43681ca029880025558b33ddc4295da3947c9b28
Reviewed-on: https://go-review.googlesource.com/c/go/+/700537
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Mark Freeman <markfreeman@google.com>
src/cmd/compile/internal/riscv64/ssa.go
src/cmd/compile/internal/ssa/_gen/RISCV64.rules
src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteRISCV64.go
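
The change replaces both the DUFFCOPY lowering and the old generic byte/word loop with two new ops: a fully unrolled LoweredMove for copies of at most 3*8*moveSize(alignment) bytes, and a LoweredMoveLoop that does eight moves per iteration beyond that. A minimal Go sketch of the size split, with moveSize written out as a simplified stand-in for the compiler helper of the same name (not the compiler's actual code):

// Sketch of the new rule split, not compiler code.
package main

import "fmt"

// moveSize stands in for the SSA config helper: the widest
// load/store width (in bytes) usable at a given alignment.
func moveSize(align int64) int64 {
	switch {
	case align%8 == 0:
		return 8
	case align%4 == 0:
		return 4
	case align%2 == 0:
		return 2
	}
	return 1
}

func strategy(s, align int64) string {
	// Up to 3 iterations' worth of 8 moves each is fully unrolled;
	// anything larger gets the generated loop.
	if s <= 3*8*moveSize(align) {
		return "LoweredMove (unrolled)"
	}
	return "LoweredMoveLoop (generated loop)"
}

func main() {
	fmt.Println(strategy(128, 8)) // unrolled: 128 <= 192
	fmt.Println(strategy(192, 8)) // unrolled: the cutoff itself
	fmt.Println(strategy(256, 8)) // loop: 256 > 192
	fmt.Println(strategy(64, 1))  // loop: byte-aligned cutoff is only 24
}

With 8-byte alignment the cutoff is 192 bytes, so the MemmoveKnownSize results above cover both sides of it.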

src/cmd/compile/internal/riscv64/ssa.go
index da281974907011e1dd372c43cdc468c6ba02f981..43a1ec4b616ae7d9c3e2934abc6c753c199950a7 100644 (file)
@@ -822,44 +822,99 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                }
 
        case ssa.OpRISCV64LoweredMove:
-               mov, sz := largestMove(v.AuxInt)
+               dst := v.Args[0].Reg()
+               src := v.Args[1].Reg()
+               if dst == src {
+                       break
+               }
 
-               //      mov     (Rarg1), T2
-               //      mov     T2, (Rarg0)
-               //      ADD     $sz, Rarg0
-               //      ADD     $sz, Rarg1
-               //      BGEU    Rarg2, Rarg0, -4(PC)
+               sa := v.AuxValAndOff()
+               n := sa.Val64()
+               mov, sz := largestMove(sa.Off64())
 
-               p := s.Prog(mov)
-               p.From.Type = obj.TYPE_MEM
-               p.From.Reg = v.Args[1].Reg()
+               var off int64
+               tmp := int16(riscv.REG_X5)
+               for n >= sz {
+                       moveOp(s, mov, dst, src, tmp, off)
+                       off += sz
+                       n -= sz
+               }
+
+               for i := len(fracMovOps) - 1; i >= 0; i-- {
+                       tsz := int64(1 << i)
+                       if n < tsz {
+                               continue
+                       }
+                       moveOp(s, fracMovOps[i], dst, src, tmp, off)
+                       off += tsz
+                       n -= tsz
+               }
+
+       case ssa.OpRISCV64LoweredMoveLoop:
+               dst := v.Args[0].Reg()
+               src := v.Args[1].Reg()
+               if dst == src {
+                       break
+               }
+
+               sc := v.AuxValAndOff()
+               n := sc.Val64()
+               mov, sz := largestMove(sc.Off64())
+               chunk := 8 * sz
+
+               if n <= 3*chunk {
+                       v.Fatalf("MoveLoop too small: %d, expect > %d", n, 3*chunk)
+               }
+               tmp := int16(riscv.REG_X5)
+
+               p := s.Prog(riscv.AADD)
+               p.From.Type = obj.TYPE_CONST
+               p.From.Offset = n - n%chunk
+               p.Reg = src
                p.To.Type = obj.TYPE_REG
-               p.To.Reg = riscv.REG_T2
+               p.To.Reg = riscv.REG_X6
 
-               p2 := s.Prog(mov)
-               p2.From.Type = obj.TYPE_REG
-               p2.From.Reg = riscv.REG_T2
-               p2.To.Type = obj.TYPE_MEM
-               p2.To.Reg = v.Args[0].Reg()
-
-               p3 := s.Prog(riscv.AADD)
-               p3.From.Type = obj.TYPE_CONST
-               p3.From.Offset = sz
-               p3.To.Type = obj.TYPE_REG
-               p3.To.Reg = v.Args[0].Reg()
-
-               p4 := s.Prog(riscv.AADD)
-               p4.From.Type = obj.TYPE_CONST
-               p4.From.Offset = sz
-               p4.To.Type = obj.TYPE_REG
-               p4.To.Reg = v.Args[1].Reg()
+               for i := int64(0); i < 8; i++ {
+                       moveOp(s, mov, dst, src, tmp, sz*i)
+               }
 
-               p5 := s.Prog(riscv.ABGEU)
-               p5.To.Type = obj.TYPE_BRANCH
-               p5.Reg = v.Args[1].Reg()
-               p5.From.Type = obj.TYPE_REG
-               p5.From.Reg = v.Args[2].Reg()
-               p5.To.SetTarget(p)
+               p1 := s.Prog(riscv.AADD)
+               p1.From.Type = obj.TYPE_CONST
+               p1.From.Offset = chunk
+               p1.To.Type = obj.TYPE_REG
+               p1.To.Reg = src
+
+               p2 := s.Prog(riscv.AADD)
+               p2.From.Type = obj.TYPE_CONST
+               p2.From.Offset = chunk
+               p2.To.Type = obj.TYPE_REG
+               p2.To.Reg = dst
+
+               p3 := s.Prog(riscv.ABNE)
+               p3.From.Reg = riscv.REG_X6
+               p3.From.Type = obj.TYPE_REG
+               p3.Reg = src
+               p3.To.Type = obj.TYPE_BRANCH
+               p3.To.SetTarget(p.Link)
+
+               n %= chunk
+
+               var off int64
+               for n >= sz {
+                       moveOp(s, mov, dst, src, tmp, off)
+                       off += sz
+                       n -= sz
+               }
+
+               for i := len(fracMovOps) - 1; i >= 0; i-- {
+                       tsz := int64(1 << i)
+                       if n < tsz {
+                               continue
+                       }
+                       moveOp(s, fracMovOps[i], dst, src, tmp, off)
+                       off += tsz
+                       n -= tsz
+               }
 
        case ssa.OpRISCV64LoweredNilCheck:
                // Issue a load which will fault if arg is nil.
@@ -1023,3 +1078,21 @@ func zeroOp(s *ssagen.State, mov obj.As, reg int16, off int64) {
        p.To.Offset = off
        return
 }
+
+func moveOp(s *ssagen.State, mov obj.As, dst int16, src int16, tmp int16, off int64) {
+       p := s.Prog(mov)
+       p.From.Type = obj.TYPE_MEM
+       p.From.Reg = src
+       p.From.Offset = off
+       p.To.Type = obj.TYPE_REG
+       p.To.Reg = tmp
+
+       p1 := s.Prog(mov)
+       p1.From.Type = obj.TYPE_REG
+       p1.From.Reg = tmp
+       p1.To.Type = obj.TYPE_MEM
+       p1.To.Reg = dst
+       p1.To.Offset = off
+
+       return
+}
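
moveOp emits one load/store pair through a temporary register. In both emitters above, the bulk pass advances in sz-byte steps and the fracMovOps pass finishes the sub-sz remainder, widest width first. fracMovOps is defined elsewhere in this file; the sketch below assumes it holds the 1-, 2- and 4-byte move opcodes indexed by log2 of the width, and models only the offset arithmetic:

// Sketch of the tail handling after the bulk sz-byte moves; not
// compiler code.
package main

import "fmt"

func tailMoves(n, off int64) []string {
	var out []string
	for i := 2; i >= 0; i-- { // len(fracMovOps)-1 down to 0
		tsz := int64(1) << i // 4, 2, 1 bytes
		if n < tsz {
			continue
		}
		out = append(out, fmt.Sprintf("%d-byte mov at offset %d", tsz, off))
		off += tsz
		n -= tsz
	}
	return out
}

func main() {
	// A 103-byte move with 8-byte alignment does twelve 8-byte pairs
	// (96 bytes), then a 7-byte tail: one 4-, one 2- and one 1-byte move.
	fmt.Println(tailMoves(7, 96))
}
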
src/cmd/compile/internal/ssa/_gen/RISCV64.rules
index 93828777954118167857c6a3703964e771f23067..e14de328ea47d3c6776d78821106d7c43711ee75 100644 (file)
        (MOVHstore [4] dst (MOVHload [4] src mem)
                (MOVHstore [2] dst (MOVHload [2] src mem)
                        (MOVHstore dst (MOVHload src mem) mem)))
-(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
-       (MOVWstore [8] dst (MOVWload [8] src mem)
-               (MOVWstore [4] dst (MOVWload [4] src mem)
-                       (MOVWstore dst (MOVWload src mem) mem)))
-(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
-       (MOVDstore [8] dst (MOVDload [8] src mem)
-               (MOVDstore dst (MOVDload src mem) mem))
-(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
-       (MOVDstore [16] dst (MOVDload [16] src mem)
-               (MOVDstore [8] dst (MOVDload [8] src mem)
-                       (MOVDstore dst (MOVDload src mem) mem)))
-(Move [32] {t} dst src mem) && t.Alignment()%8 == 0 =>
-       (MOVDstore [24] dst (MOVDload [24] src mem)
-               (MOVDstore [16] dst (MOVDload [16] src mem)
-                       (MOVDstore [8] dst (MOVDload [8] src mem)
-                               (MOVDstore dst (MOVDload src mem) mem))))
-
-// Medium 8-aligned move uses a Duff's device
-// 16 and 128 are magic constants, see runtime/mkduff.go
-(Move [s] {t} dst src mem)
-       && s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0
+
+// Generic move
+(Move [s] {t} dst src mem) && s > 0 && s <= 3*8*moveSize(t.Alignment(), config)
        && logLargeCopy(v, s) =>
-       (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+       (LoweredMove [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
 
 // Generic move uses a loop
-(Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) =>
-       (LoweredMove [t.Alignment()]
-               dst
-               src
-               (ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src)
-               mem)
+(Move [s] {t} dst src mem) && s > 3*8*moveSize(t.Alignment(), config)
+       && logLargeCopy(v, s) =>
+       (LoweredMoveLoop [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
 
 // Boolean ops; 0=false, 1=true
 (AndB ...) => (AND ...)
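
Both new rules pack two quantities into one auxint: the copy size via the ValAndOff value field and the type alignment via its offset field, which the emitter reads back with Val64 and Off64. A self-contained sketch mirroring that encoding (the real type lives in the SSA package):

// Sketch of the ValAndOff packing: value in the high 32 bits,
// offset sign-extended from the low 32 bits.
package main

import "fmt"

type valAndOff int64

func makeValAndOff(val, off int32) valAndOff {
	return valAndOff(int64(val)<<32 + int64(uint32(off)))
}

func (x valAndOff) Val64() int64 { return int64(x) >> 32 }
func (x valAndOff) Off64() int64 { return int64(int32(x)) }

func main() {
	aux := makeValAndOff(256, 8)          // 256-byte copy, 8-byte alignment
	fmt.Println(aux.Val64(), aux.Off64()) // 256 8
}
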
src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go
index 8e2f85b8d70f5bbafd71b345040d8f1443353634..6507a227088394920511c5e26a3409e32f31b1ee 100644 (file)
@@ -117,6 +117,7 @@ func init() {
 
        regCtxt := regNamed["X26"]
        callerSave := gpMask | fpMask | regNamed["g"]
+       r5toR6 := regNamed["X5"] | regNamed["X6"]
 
        var (
                gpstore  = regInfo{inputs: []regMask{gpspsbMask, gpspMask, 0}} // SB in first input so we can load from a global, but not in second to avoid using SB as a temporary register
@@ -354,27 +355,51 @@ func init() {
                },
 
                // general unaligned move
-               // arg0 = address of dst memory (in X5, changed as side effect)
-               // arg1 = address of src memory (in X6, changed as side effect)
-               // arg2 = address of the last element of src (can't be X7 as we clobber it before using arg2)
+               // arg0 = address of dst memory
+               // arg1 = address of src memory
+               // arg2 = mem
+               // auxint = size and type alignment
+               // returns mem
+               //      mov     (offset)(Rarg1), TMP
+               //      mov     TMP, (offset)(Rarg0)
+               {
+                       name:      "LoweredMove",
+                       aux:       "SymValAndOff",
+                       symEffect: "Write",
+                       argLength: 3,
+                       reg: regInfo{
+                               inputs:   []regMask{gpMask &^ regNamed["X5"], gpMask &^ regNamed["X5"]},
+                               clobbers: regNamed["X5"],
+                       },
+                       faultOnNilArg0: true,
+                       faultOnNilArg1: true,
+               },
+
+               // general unaligned move, eight moves per loop iteration
+               // arg0 = address of dst memory (clobber)
+               // arg1 = address of src memory (clobber)
                // arg2 = mem
                // auxint = size and type alignment
-               // clobbers X7 as a tmp register.
                // returns mem
-               //      mov     (X6), X7
-               //      mov     X7, (X5)
-               //      ADD     $sz, X5
                //      ADD     $(n - n%(8*sz)), Rarg1, X6
-               //      BGEU    Rarg2, X5, -4(PC)
+               //loop:
+               //      mov     (Rarg1), X5
+               //      mov     X5, (Rarg0)
+               //      ... 7 more mov pairs ...
+               //      ADD     $(8*sz), Rarg1
+               //      ADD     $(8*sz), Rarg0
+               //      BNE     X6, Rarg1, loop
                {
-                       name:      "LoweredMove",
-                       aux:       "Int64",
-                       argLength: 4,
+                       name:      "LoweredMoveLoop",
+                       aux:       "SymValAndOff",
+                       argLength: 3,
+                       symEffect: "Write",
                        reg: regInfo{
-                               inputs:   []regMask{regNamed["X5"], regNamed["X6"], gpMask &^ regNamed["X7"]},
-                               clobbers: regNamed["X5"] | regNamed["X6"] | regNamed["X7"],
+                               inputs:       []regMask{gpMask &^ r5toR6, gpMask &^ r5toR6},
+                               clobbers:     r5toR6,
+                               clobbersArg0: true,
+                               clobbersArg1: true,
                        },
-                       typ:            "Mem",
                        faultOnNilArg0: true,
                        faultOnNilArg1: true,
                },
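
Given the auxint size n and move width sz, the emitted loop runs (n - n%chunk)/chunk iterations of chunk = 8*sz bytes each, with the remainder finished inline after the loop. A small sketch of that arithmetic, valid under the emitter's n > 3*chunk check:

// Sketch of the loop bounds the LoweredMoveLoop emitter computes.
package main

import "fmt"

func loopBounds(n, sz int64) (iterations, tail int64) {
	chunk := 8 * sz
	iterations = (n - n%chunk) / chunk // X6 = src + n - n%chunk is the end pointer
	tail = n % chunk                   // finished by bulk moves + fracMovOps
	return
}

func main() {
	it, tail := loopBounds(1024, 8)
	fmt.Println(it, tail) // 16 0: sixteen 64-byte iterations, no tail
	it, tail = loopBounds(250, 8)
	fmt.Println(it, tail) // 3 58: 58-byte tail -> seven 8-byte moves + one 2-byte
}
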
src/cmd/compile/internal/ssa/opGen.go
index 5f9572d6752386d0096f64fb2eadb04a449d68fe..592e99f327f397713975d3a53e05741f4035fa00 100644 (file)
@@ -2571,6 +2571,7 @@ const (
        OpRISCV64LoweredZero
        OpRISCV64LoweredZeroLoop
        OpRISCV64LoweredMove
+       OpRISCV64LoweredMoveLoop
        OpRISCV64LoweredAtomicLoad8
        OpRISCV64LoweredAtomicLoad32
        OpRISCV64LoweredAtomicLoad64
@@ -34585,17 +34586,34 @@ var opcodeTable = [...]opInfo{
        },
        {
                name:           "LoweredMove",
-               auxType:        auxInt64,
-               argLen:         4,
+               auxType:        auxSymValAndOff,
+               argLen:         3,
                faultOnNilArg0: true,
                faultOnNilArg1: true,
+               symEffect:      SymWrite,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 16},         // X5
-                               {1, 32},         // X6
-                               {2, 1006632880}, // X5 X6 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+                               {0, 1006632928}, // X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+                               {1, 1006632928}, // X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
                        },
-                       clobbers: 112, // X5 X6 X7
+                       clobbers: 16, // X5
+               },
+       },
+       {
+               name:           "LoweredMoveLoop",
+               auxType:        auxSymValAndOff,
+               argLen:         3,
+               faultOnNilArg0: true,
+               faultOnNilArg1: true,
+               symEffect:      SymWrite,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 1006632896}, // X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+                               {1, 1006632896}, // X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+                       },
+                       clobbers:     48, // X5 X6
+                       clobbersArg0: true,
+                       clobbersArg1: true,
                },
        },
        {
src/cmd/compile/internal/ssa/rewriteRISCV64.go
index faa465b9db977a3e78a0947980428f47d2916c5a..5723327bc920eb8381d39a4a0e24967acc597621 100644 (file)
@@ -3090,169 +3090,38 @@ func rewriteValueRISCV64_OpMove(v *Value) bool {
                v.AddArg3(dst, v0, v1)
                return true
        }
-       // match: (Move [12] {t} dst src mem)
-       // cond: t.Alignment()%4 == 0
-       // result: (MOVWstore [8] dst (MOVWload [8] src mem) (MOVWstore [4] dst (MOVWload [4] src mem) (MOVWstore dst (MOVWload src mem) mem)))
-       for {
-               if auxIntToInt64(v.AuxInt) != 12 {
-                       break
-               }
-               t := auxToType(v.Aux)
-               dst := v_0
-               src := v_1
-               mem := v_2
-               if !(t.Alignment()%4 == 0) {
-                       break
-               }
-               v.reset(OpRISCV64MOVWstore)
-               v.AuxInt = int32ToAuxInt(8)
-               v0 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
-               v0.AuxInt = int32ToAuxInt(8)
-               v0.AddArg2(src, mem)
-               v1 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
-               v1.AuxInt = int32ToAuxInt(4)
-               v2 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
-               v2.AuxInt = int32ToAuxInt(4)
-               v2.AddArg2(src, mem)
-               v3 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
-               v4 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
-               v4.AddArg2(src, mem)
-               v3.AddArg3(dst, v4, mem)
-               v1.AddArg3(dst, v2, v3)
-               v.AddArg3(dst, v0, v1)
-               return true
-       }
-       // match: (Move [16] {t} dst src mem)
-       // cond: t.Alignment()%8 == 0
-       // result: (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem))
-       for {
-               if auxIntToInt64(v.AuxInt) != 16 {
-                       break
-               }
-               t := auxToType(v.Aux)
-               dst := v_0
-               src := v_1
-               mem := v_2
-               if !(t.Alignment()%8 == 0) {
-                       break
-               }
-               v.reset(OpRISCV64MOVDstore)
-               v.AuxInt = int32ToAuxInt(8)
-               v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
-               v0.AuxInt = int32ToAuxInt(8)
-               v0.AddArg2(src, mem)
-               v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-               v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
-               v2.AddArg2(src, mem)
-               v1.AddArg3(dst, v2, mem)
-               v.AddArg3(dst, v0, v1)
-               return true
-       }
-       // match: (Move [24] {t} dst src mem)
-       // cond: t.Alignment()%8 == 0
-       // result: (MOVDstore [16] dst (MOVDload [16] src mem) (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem)))
-       for {
-               if auxIntToInt64(v.AuxInt) != 24 {
-                       break
-               }
-               t := auxToType(v.Aux)
-               dst := v_0
-               src := v_1
-               mem := v_2
-               if !(t.Alignment()%8 == 0) {
-                       break
-               }
-               v.reset(OpRISCV64MOVDstore)
-               v.AuxInt = int32ToAuxInt(16)
-               v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
-               v0.AuxInt = int32ToAuxInt(16)
-               v0.AddArg2(src, mem)
-               v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-               v1.AuxInt = int32ToAuxInt(8)
-               v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
-               v2.AuxInt = int32ToAuxInt(8)
-               v2.AddArg2(src, mem)
-               v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-               v4 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
-               v4.AddArg2(src, mem)
-               v3.AddArg3(dst, v4, mem)
-               v1.AddArg3(dst, v2, v3)
-               v.AddArg3(dst, v0, v1)
-               return true
-       }
-       // match: (Move [32] {t} dst src mem)
-       // cond: t.Alignment()%8 == 0
-       // result: (MOVDstore [24] dst (MOVDload [24] src mem) (MOVDstore [16] dst (MOVDload [16] src mem) (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem))))
-       for {
-               if auxIntToInt64(v.AuxInt) != 32 {
-                       break
-               }
-               t := auxToType(v.Aux)
-               dst := v_0
-               src := v_1
-               mem := v_2
-               if !(t.Alignment()%8 == 0) {
-                       break
-               }
-               v.reset(OpRISCV64MOVDstore)
-               v.AuxInt = int32ToAuxInt(24)
-               v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
-               v0.AuxInt = int32ToAuxInt(24)
-               v0.AddArg2(src, mem)
-               v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-               v1.AuxInt = int32ToAuxInt(16)
-               v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
-               v2.AuxInt = int32ToAuxInt(16)
-               v2.AddArg2(src, mem)
-               v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-               v3.AuxInt = int32ToAuxInt(8)
-               v4 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
-               v4.AuxInt = int32ToAuxInt(8)
-               v4.AddArg2(src, mem)
-               v5 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-               v6 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
-               v6.AddArg2(src, mem)
-               v5.AddArg3(dst, v6, mem)
-               v3.AddArg3(dst, v4, v5)
-               v1.AddArg3(dst, v2, v3)
-               v.AddArg3(dst, v0, v1)
-               return true
-       }
        // match: (Move [s] {t} dst src mem)
-       // cond: s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && logLargeCopy(v, s)
-       // result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+       // cond: s > 0 && s <= 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)
+       // result: (LoweredMove [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
        for {
                s := auxIntToInt64(v.AuxInt)
                t := auxToType(v.Aux)
                dst := v_0
                src := v_1
                mem := v_2
-               if !(s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && logLargeCopy(v, s)) {
+               if !(s > 0 && s <= 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)) {
                        break
                }
-               v.reset(OpRISCV64DUFFCOPY)
-               v.AuxInt = int64ToAuxInt(16 * (128 - s/8))
+               v.reset(OpRISCV64LoweredMove)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment())))
                v.AddArg3(dst, src, mem)
                return true
        }
        // match: (Move [s] {t} dst src mem)
-       // cond: (s <= 16 || logLargeCopy(v, s))
-       // result: (LoweredMove [t.Alignment()] dst src (ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src) mem)
+       // cond: s > 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)
+       // result: (LoweredMoveLoop [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
        for {
                s := auxIntToInt64(v.AuxInt)
                t := auxToType(v.Aux)
                dst := v_0
                src := v_1
                mem := v_2
-               if !(s <= 16 || logLargeCopy(v, s)) {
+               if !(s > 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)) {
                        break
                }
-               v.reset(OpRISCV64LoweredMove)
-               v.AuxInt = int64ToAuxInt(t.Alignment())
-               v0 := b.NewValue0(v.Pos, OpRISCV64ADDI, src.Type)
-               v0.AuxInt = int64ToAuxInt(s - moveSize(t.Alignment(), config))
-               v0.AddArg(src)
-               v.AddArg4(dst, src, v0, mem)
+               v.reset(OpRISCV64LoweredMoveLoop)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment())))
+               v.AddArg3(dst, src, mem)
                return true
        }
        return false
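
To see which rule a given copy hits, here is a hypothetical user-level example for an 8-byte-aligned element type, with sizes chosen on both sides of the 192-byte cutoff; the generated assembly can be inspected with go build -gcflags=-S for a riscv64 target:

// Hypothetical example, not from the CL.
package main

type small [16]uint64 // 128 bytes: within the 192-byte cutoff
type large [64]uint64 // 512 bytes: past the cutoff

//go:noinline
func copySmall(d, s *small) { *d = *s } // expected: unrolled LoweredMove

//go:noinline
func copyLarge(d, s *large) { *d = *s } // expected: LoweredMoveLoop

func main() {
	var a, b small
	copySmall(&a, &b)
	var c, d large
	copyLarge(&c, &d)
}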