From: Meng Zhuo Date: Tue, 6 Mar 2018 02:47:09 +0000 (+0000) Subject: runtime, cmd/compile: use ldp for DUFFCOPY on ARM64 X-Git-Tag: go1.11beta1~1336 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=8916773a3d6faabed5d500f8bf0d89b7048aecde;p=gostls13.git runtime, cmd/compile: use ldp for DUFFCOPY on ARM64 name old time/op new time/op delta CopyFat8 2.15ns ± 1% 2.19ns ± 6% ~ (p=0.171 n=8+9) CopyFat12 2.15ns ± 0% 2.17ns ± 2% ~ (p=0.137 n=8+10) CopyFat16 2.17ns ± 3% 2.15ns ± 0% ~ (p=0.211 n=10+10) CopyFat24 2.16ns ± 1% 2.15ns ± 0% ~ (p=0.087 n=10+10) CopyFat32 11.5ns ± 0% 12.8ns ± 2% +10.87% (p=0.000 n=8+10) CopyFat64 20.2ns ± 2% 12.9ns ± 0% -36.11% (p=0.000 n=10+10) CopyFat128 37.2ns ± 0% 21.5ns ± 0% -42.20% (p=0.000 n=10+10) CopyFat256 71.6ns ± 0% 38.7ns ± 0% -45.95% (p=0.000 n=10+10) CopyFat512 140ns ± 0% 73ns ± 0% -47.86% (p=0.000 n=10+9) CopyFat520 142ns ± 0% 74ns ± 0% -47.54% (p=0.000 n=10+10) CopyFat1024 277ns ± 0% 141ns ± 0% -49.10% (p=0.000 n=10+10) Change-Id: If54bc571add5db674d5e081579c87e80153d0a5a Reviewed-on: https://go-review.googlesource.com/97395 Reviewed-by: Cherry Zhang --- diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index 7b31d46c5a..263ccba548 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -462,11 +462,21 @@ (Move [s-s%8] dst src mem)) // medium move uses a duff device -// 8 and 128 are magic constants, see runtime/mkduff.go (Move [s] dst src mem) - && s%8 == 0 && s > 24 && s <= 8*128 + && s > 32 && s <= 16*64 && s%16 == 8 && !config.noDuffDevice -> - (DUFFCOPY [8 * (128 - s/8)] dst src mem) + (MOVDstore [s-8] dst (MOVDload [s-8] src mem) + (DUFFCOPY [8*(64-(s-8)/16)] dst src mem)) +(Move [s] dst src mem) + && s > 32 && s <= 16*64 && s%16 == 0 + && !config.noDuffDevice -> + (DUFFCOPY [8 * (64 - s/16)] dst src mem) +// 8 is the number of bytes to encode: +// +// LDP.P 16(R16), (R26, R27) +// STP.P (R26, R27), 16(R17) +// +// 64 is number of these blocks. See runtime/duff_arm64.s:duffcopy // large move uses a loop (Move [s] dst src mem) diff --git a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go index 14b1c9afa4..008be3c47e 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go @@ -426,7 +426,7 @@ func init() { argLength: 3, reg: regInfo{ inputs: []regMask{buildReg("R17"), buildReg("R16")}, - clobbers: buildReg("R16 R17 R30"), + clobbers: buildReg("R16 R17 R26 R30"), }, faultOnNilArg0: true, faultOnNilArg1: true, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index d074861544..48ec74b3d2 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -14656,7 +14656,7 @@ var opcodeTable = [...]opInfo{ {0, 131072}, // R17 {1, 65536}, // R16 }, - clobbers: 537067520, // R16 R17 R30 + clobbers: 604176384, // R16 R17 R26 R30 }, }, { diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index 10a7a4cb05..d7ade01de4 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -17371,19 +17371,47 @@ func rewriteValueARM64_OpMove_10(v *Value) bool { return true } // match: (Move [s] dst src mem) - // cond: s%8 == 0 && s > 24 && s <= 8*128 && !config.noDuffDevice - // result: (DUFFCOPY [8 * (128 - s/8)] dst src mem) + // cond: s > 32 && s <= 16*64 && s%16 == 8 && !config.noDuffDevice + // result: (MOVDstore [s-8] dst (MOVDload [s-8] src mem) (DUFFCOPY [8*(64-(s-8)/16)] dst src mem)) for { s := v.AuxInt _ = v.Args[2] dst := v.Args[0] src := v.Args[1] mem := v.Args[2] - if !(s%8 == 0 && s > 24 && s <= 8*128 && !config.noDuffDevice) { + if !(s > 32 && s <= 16*64 && s%16 == 8 && !config.noDuffDevice) { + break + } + v.reset(OpARM64MOVDstore) + v.AuxInt = s - 8 + v.AddArg(dst) + v0 := b.NewValue0(v.Pos, OpARM64MOVDload, typ.UInt64) + v0.AuxInt = s - 8 + v0.AddArg(src) + v0.AddArg(mem) + v.AddArg(v0) + v1 := b.NewValue0(v.Pos, OpARM64DUFFCOPY, types.TypeMem) + v1.AuxInt = 8 * (64 - (s-8)/16) + v1.AddArg(dst) + v1.AddArg(src) + v1.AddArg(mem) + v.AddArg(v1) + return true + } + // match: (Move [s] dst src mem) + // cond: s > 32 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice + // result: (DUFFCOPY [8 * (64 - s/16)] dst src mem) + for { + s := v.AuxInt + _ = v.Args[2] + dst := v.Args[0] + src := v.Args[1] + mem := v.Args[2] + if !(s > 32 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice) { break } v.reset(OpARM64DUFFCOPY) - v.AuxInt = 8 * (128 - s/8) + v.AuxInt = 8 * (64 - s/16) v.AddArg(dst) v.AddArg(src) v.AddArg(mem) diff --git a/src/runtime/duff_arm64.s b/src/runtime/duff_arm64.s index 54e6b9967e..3739c3945a 100644 --- a/src/runtime/duff_arm64.s +++ b/src/runtime/duff_arm64.s @@ -71,389 +71,197 @@ TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0 STP (ZR, ZR), (R16) RET -TEXT runtime·duffcopy(SB), NOSPLIT, $0-0 - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) +TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0 + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) - - MOVD.P 8(R16), R27 - MOVD.P R27, 8(R17) + LDP.P 16(R16), (R26, R27) + STP.P (R26, R27), 16(R17) RET diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go index 62de604e69..b490cd815f 100644 --- a/src/runtime/memmove_test.go +++ b/src/runtime/memmove_test.go @@ -450,6 +450,13 @@ func BenchmarkCopyFat512(b *testing.B) { _ = y } } +func BenchmarkCopyFat520(b *testing.B) { + var x [520 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} func BenchmarkCopyFat1024(b *testing.B) { var x [1024 / 4]uint32 for i := 0; i < b.N; i++ { diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go index fcc7f83197..b7c7e2689c 100644 --- a/src/runtime/mkduff.go +++ b/src/runtime/mkduff.go @@ -164,12 +164,13 @@ func zeroARM64(w io.Writer) { func copyARM64(w io.Writer) { // R16 (aka REGRT1): ptr to source memory // R17 (aka REGRT2): ptr to destination memory - // R27 (aka REGTMP): scratch space + // R26, R27 (aka REGTMP): scratch space // R16 and R17 are updated as a side effect - fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0") - for i := 0; i < 128; i++ { - fmt.Fprintln(w, "\tMOVD.P\t8(R16), R27") - fmt.Fprintln(w, "\tMOVD.P\tR27, 8(R17)") + fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0") + + for i := 0; i < 64; i++ { + fmt.Fprintln(w, "\tLDP.P\t16(R16), (R26, R27)") + fmt.Fprintln(w, "\tSTP.P\t(R26, R27), 16(R17)") fmt.Fprintln(w) } fmt.Fprintln(w, "\tRET")