From 150d2448e5a213cd679396371c0a147918dc2125 Mon Sep 17 00:00:00 2001
From: Michał Derkacz
Date: Sun, 14 Jun 2020 00:06:24 +0200
Subject: [PATCH] cmd/compile,cmd/internal/obj/riscv,runtime: use Duff's devices on riscv64

Implement runtime.duffzero and runtime.duffcopy for riscv64.
Use obj.ADUFFZERO/obj.ADUFFCOPY for medium size, word aligned
zeroing/moving.

Change-Id: I42ec622055630c94cb77e286d8d33dbe7c9f846c
Reviewed-on: https://go-review.googlesource.com/c/go/+/237797
Run-TryBot: Cherry Zhang
Reviewed-by: Joel Sing
Reviewed-by: Cherry Zhang
---
 src/cmd/compile/internal/riscv64/ggen.go    |  10 +-
 src/cmd/compile/internal/riscv64/ssa.go     |  14 +
 .../compile/internal/ssa/gen/RISCV64.rules  |  14 +
 .../compile/internal/ssa/gen/RISCV64Ops.go  |  38 +
 src/cmd/compile/internal/ssa/opGen.go       |  28 +
 .../compile/internal/ssa/rewriteRISCV64.go  |  33 +
 src/cmd/internal/obj/riscv/obj.go           |   8 +-
 src/runtime/duff_riscv64.s                  | 907 ++++++++++++++++++
 src/runtime/mkduff.go                       |  28 +
 9 files changed, 1076 insertions(+), 4 deletions(-)
 create mode 100644 src/runtime/duff_riscv64.s

diff --git a/src/cmd/compile/internal/riscv64/ggen.go b/src/cmd/compile/internal/riscv64/ggen.go
index be31fad441..f7c03fe7c2 100644
--- a/src/cmd/compile/internal/riscv64/ggen.go
+++ b/src/cmd/compile/internal/riscv64/ggen.go
@@ -25,7 +25,15 @@ func zeroRange(pp *gc.Progs, p *obj.Prog, off, cnt int64, _ *uint32) *obj.Prog {
 		return p
 	}
 
-	// TODO(jsing): Add a duff zero implementation for medium sized ranges.
+	if cnt <= int64(128*gc.Widthptr) {
+		p = pp.Appendpp(p, riscv.AADDI, obj.TYPE_CONST, 0, off, obj.TYPE_REG, riscv.REG_A0, 0)
+		p.Reg = riscv.REG_SP
+		p = pp.Appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_MEM, 0, 0)
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Duffzero
+		p.To.Offset = 8 * (128 - cnt/int64(gc.Widthptr))
+		return p
+	}
 
 	// Loop, zeroing pointer width bytes at a time.
 	// ADD	$(off), SP, T0
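The To.Offset computed above selects how far into runtime·duffzero execution starts. Each unrolled iteration of the riscv64 duffzero body is two 4-byte instructions (a store of ZERO and a pointer increment), so one zeroed word costs 8 bytes of code, and entering 8 * (128 - n) bytes past the start leaves exactly n iterations before the RET. A minimal Go sketch of that arithmetic (duffzeroEntry is a hypothetical name, not part of the patch; assumes gc.Widthptr == 8 on riscv64):

	package main

	import "fmt"

	// duffzeroEntry returns the byte offset into runtime.duffzero at which
	// execution must start in order to zero cnt bytes (cnt a multiple of 8,
	// at most 128*8 = 1024). Each unrolled iteration is two 4-byte
	// instructions, i.e. 8 bytes of code per zeroed word.
	func duffzeroEntry(cnt int64) int64 {
		const widthptr = 8       // pointer width on riscv64
		words := cnt / widthptr  // iterations still required
		return 8 * (128 - words) // skip the iterations that are not needed
	}

	func main() {
		fmt.Println(duffzeroEntry(64))   // 960: run only the last 8 iterations
		fmt.Println(duffzeroEntry(1024)) // 0: run all 128 iterations
	}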
diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go
index 064a1ca111..0beb5b4bd1 100644
--- a/src/cmd/compile/internal/riscv64/ssa.go
+++ b/src/cmd/compile/internal/riscv64/ssa.go
@@ -608,6 +608,20 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
 
+	case ssa.OpRISCV64DUFFZERO:
+		p := s.Prog(obj.ADUFFZERO)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Duffzero
+		p.To.Offset = v.AuxInt
+
+	case ssa.OpRISCV64DUFFCOPY:
+		p := s.Prog(obj.ADUFFCOPY)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Duffcopy
+		p.To.Offset = v.AuxInt
+
 	default:
 		v.Fatalf("Unhandled op %v", v.Op)
 	}
diff --git a/src/cmd/compile/internal/ssa/gen/RISCV64.rules b/src/cmd/compile/internal/ssa/gen/RISCV64.rules
index 3bc2e8498a..325cbeb825 100644
--- a/src/cmd/compile/internal/ssa/gen/RISCV64.rules
+++ b/src/cmd/compile/internal/ssa/gen/RISCV64.rules
@@ -360,6 +360,13 @@
 (Zero [4] ptr mem) => (MOVWstore ptr (MOVWconst) mem)
 (Zero [8] ptr mem) => (MOVDstore ptr (MOVDconst) mem)
 
+// Medium zeroing uses a Duff's device
+// 8 and 128 are magic constants, see runtime/mkduff.go
+(Zero [s] {t} ptr mem)
+	&& s%8 == 0 && s >= 16 && s <= 8*128
+	&& t.Alignment()%8 == 0 && !config.noDuffDevice =>
+	(DUFFZERO [8 * (128 - s/8)] ptr mem)
+
 // Generic zeroing uses a loop
 (Zero [s] {t} ptr mem) =>
 	(LoweredZero [t.Alignment()]
@@ -395,6 +402,13 @@
 (Move [4] dst src mem) => (MOVWstore dst (MOVWload src mem) mem)
 (Move [8] dst src mem) => (MOVDstore dst (MOVDload src mem) mem)
 
+// Medium move uses a Duff's device
+// 16 and 128 are magic constants, see runtime/mkduff.go
+(Move [s] {t} dst src mem)
+	&& s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0
+	&& !config.noDuffDevice && logLargeCopy(v, s) =>
+	(DUFFCOPY [16 * (128 - s/8)] dst src mem)
+
 // Generic move uses a loop
 (Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) =>
 	(LoweredMove [t.Alignment()]
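The DUFFCOPY multiplier is 16 where DUFFZERO uses 8 because one unrolled copy iteration is four 4-byte instructions (load, advance source, store, advance destination) against two for zeroing. The size and alignment conditions the two rules share can be restated as a small sketch (duffEligible is a hypothetical helper, not part of the patch; the rules additionally require !config.noDuffDevice, and logLargeCopy for moves):

	// duffEligible reports whether a zero or copy of s bytes with the given
	// alignment may take the Duff's device path on riscv64.
	// AuxInt then encodes the entry offset: 8*(128-s/8) for DUFFZERO,
	// 16*(128-s/8) for DUFFCOPY.
	func duffEligible(s, alignment int64) bool {
		return s%8 == 0 && // whole 8-byte words only
			s >= 16 && // smaller sizes are handled by single-store rules
			s <= 8*128 && // the devices unroll at most 128 words (1 KiB)
			alignment%8 == 0 // every store must be 8-byte aligned
	}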
diff --git a/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
index ebd515b7fc..f64319230b 100644
--- a/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
@@ -240,6 +240,44 @@ func init() {
 		{name: "CALLclosure", argLength: 3, reg: callClosure, aux: "CallOff", call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
 		{name: "CALLinter", argLength: 2, reg: callInter, aux: "CallOff", call: true},     // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
 
+		// duffzero
+		// arg0 = address of memory to zero (in X10, changed as side effect)
+		// arg1 = mem
+		// auxint = offset into duffzero code to start executing
+		// X1 (link register) changed because of function call
+		// returns mem
+		{
+			name:      "DUFFZERO",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs:   []regMask{regNamed["X10"]},
+				clobbers: regNamed["X1"] | regNamed["X10"],
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+		},
+
+		// duffcopy
+		// arg0 = address of dst memory (in X11, changed as side effect)
+		// arg1 = address of src memory (in X10, changed as side effect)
+		// arg2 = mem
+		// auxint = offset into duffcopy code to start executing
+		// X1 (link register) changed because of function call
+		// returns mem
+		{
+			name:      "DUFFCOPY",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{regNamed["X11"], regNamed["X10"]},
+				clobbers: regNamed["X1"] | regNamed["X10"] | regNamed["X11"],
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+		},
+
 		// Generic moves and zeros
 
 		// general unaligned zeroing
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index bb1cbc0baa..96aa3adedd 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2111,6 +2111,8 @@ const (
 	OpRISCV64CALLstatic
 	OpRISCV64CALLclosure
 	OpRISCV64CALLinter
+	OpRISCV64DUFFZERO
+	OpRISCV64DUFFCOPY
 	OpRISCV64LoweredZero
 	OpRISCV64LoweredMove
 	OpRISCV64LoweredAtomicLoad8
@@ -28162,6 +28164,32 @@ var opcodeTable = [...]opInfo{
 			clobbers: 9223372035781033972, // X3 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 g X28 X29 X30 F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
 		},
 	},
+	{
+		name:           "DUFFZERO",
+		auxType:        auxInt64,
+		argLen:         2,
+		faultOnNilArg0: true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 512}, // X10
+			},
+			clobbers: 512, // X10
+		},
+	},
+	{
+		name:           "DUFFCOPY",
+		auxType:        auxInt64,
+		argLen:         3,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1024}, // X11
+				{1, 512},  // X10
+			},
+			clobbers: 1536, // X10 X11
+		},
+	},
 	{
 		name:    "LoweredZero",
 		auxType: auxInt64,
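The numeric masks in the generated opcodeTable use one bit per register; the generated comments identify 512 as X10 and 1024 as X11, so bit n-1 stands for Xn and the DUFFCOPY clobber mask 1536 is simply X10|X11. (The X1 clobber declared in RISCV64Ops.go does not surface in the generated table, evidently because the link register is not an allocatable register and so has no mask bit.) A small sketch of that encoding (xMask is hypothetical; the bit assignment is an assumption inferred from the generated comments):

	package main

	import "fmt"

	// xMask returns the regmask bit for integer register Xn, assuming
	// bit n-1 corresponds to Xn (consistent with 512 annotated as X10).
	func xMask(n uint) uint64 { return 1 << (n - 1) }

	func main() {
		fmt.Println(xMask(10))             // 512: DUFFZERO input and clobber (X10)
		fmt.Println(xMask(11))             // 1024: DUFFCOPY destination input (X11)
		fmt.Println(xMask(10) | xMask(11)) // 1536: DUFFCOPY clobbers (X10 X11)
	}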
diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go
index ac92945753..c337ffbfe7 100644
--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go
+++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go
@@ -2017,6 +2017,23 @@ func rewriteValueRISCV64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] {t} dst src mem)
+	// cond: s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice && logLargeCopy(v, s)
+	// result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+	for {
+		s := auxIntToInt64(v.AuxInt)
+		t := auxToType(v.Aux)
+		dst := v_0
+		src := v_1
+		mem := v_2
+		if !(s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice && logLargeCopy(v, s)) {
+			break
+		}
+		v.reset(OpRISCV64DUFFCOPY)
+		v.AuxInt = int64ToAuxInt(16 * (128 - s/8))
+		v.AddArg3(dst, src, mem)
+		return true
+	}
+	// match: (Move [s] {t} dst src mem)
 	// cond: (s <= 16 || logLargeCopy(v, s))
 	// result: (LoweredMove [t.Alignment()] dst src (ADDI [s-moveSize(t.Alignment(), config)] src) mem)
 	for {
@@ -5650,6 +5667,22 @@ func rewriteValueRISCV64_OpZero(v *Value) bool {
 		return true
 	}
 	// match: (Zero [s] {t} ptr mem)
+	// cond: s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice
+	// result: (DUFFZERO [8 * (128 - s/8)] ptr mem)
+	for {
+		s := auxIntToInt64(v.AuxInt)
+		t := auxToType(v.Aux)
+		ptr := v_0
+		mem := v_1
+		if !(s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice) {
+			break
+		}
+		v.reset(OpRISCV64DUFFZERO)
+		v.AuxInt = int64ToAuxInt(8 * (128 - s/8))
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	// match: (Zero [s] {t} ptr mem)
 	// result: (LoweredZero [t.Alignment()] ptr (ADD ptr (MOVDconst [s-moveSize(t.Alignment(), config)])) mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go
index da49f67138..5301e44002 100644
--- a/src/cmd/internal/obj/riscv/obj.go
+++ b/src/cmd/internal/obj/riscv/obj.go
@@ -33,7 +33,7 @@ func buildop(ctxt *obj.Link) {}
 // lr is the link register to use for the JALR.
 // p must be a CALL, JMP or RET.
 func jalrToSym(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc, lr int16) *obj.Prog {
-	if p.As != obj.ACALL && p.As != obj.AJMP && p.As != obj.ARET {
+	if p.As != obj.ACALL && p.As != obj.AJMP && p.As != obj.ARET && p.As != obj.ADUFFZERO && p.As != obj.ADUFFCOPY {
 		ctxt.Diag("unexpected Prog in jalrToSym: %v", p)
 		return p
 	}
@@ -417,7 +417,7 @@ func containsCall(sym *obj.LSym) bool {
 	// CALLs are CALL or JAL(R) with link register LR.
 	for p := sym.Func().Text; p != nil; p = p.Link {
 		switch p.As {
-		case obj.ACALL:
+		case obj.ACALL, obj.ADUFFZERO, obj.ADUFFCOPY:
 			return true
 		case AJAL, AJALR:
 			if p.From.Type == obj.TYPE_REG && p.From.Reg == REG_LR {
@@ -656,7 +656,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 				p.From.Reg = REG_SP
 			}
 
-		case obj.ACALL:
+		case obj.ACALL, obj.ADUFFZERO, obj.ADUFFCOPY:
 			switch p.To.Type {
 			case obj.TYPE_MEM:
 				jalrToSym(ctxt, p, newprog, REG_LR)
@@ -1696,6 +1696,8 @@ var encodings = [ALAST & obj.AMask]encoding{
 	obj.APCDATA: pseudoOpEncoding,
 	obj.ATEXT:   pseudoOpEncoding,
 	obj.ANOP:    pseudoOpEncoding,
+	obj.ADUFFZERO: pseudoOpEncoding,
+	obj.ADUFFCOPY: pseudoOpEncoding,
 }
 
 // encodingForAs returns the encoding for an obj.As.
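The three obj.go changes all follow from treating the Duff pseudo-instructions as static calls: jalrToSym must accept them, containsCall must report them so that frame setup spills the link register, and the encoder must classify them as pseudo-ops that preprocess expands into real call sequences. A condensed restatement of that predicate (a sketch only, assuming access to cmd/internal/obj, which is importable only from within cmd; the real logic lives in the switches above):

	import "cmd/internal/obj"

	// isCallLike mirrors the classification the patch extends: the Duff
	// entry points behave like CALLs for both code generation and
	// leaf-function detection, because they return through X1.
	func isCallLike(as obj.As) bool {
		switch as {
		case obj.ACALL, obj.ADUFFZERO, obj.ADUFFCOPY:
			return true
		}
		return false
	}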
diff --git a/src/runtime/duff_riscv64.s b/src/runtime/duff_riscv64.s
new file mode 100644
index 0000000000..f7bd3f326e
--- /dev/null
+++ b/src/runtime/duff_riscv64.s
@@ -0,0 +1,907 @@
+// Code generated by mkduff.go; DO NOT EDIT.
+// Run go generate from src/runtime to update.
+// See mkduff.go for comments.
+
+#include "textflag.h"
+
+TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	MOV	ZERO, (X10)
+	ADD	$8, X10
+	RET
+
+TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	MOV	(X10), X31
+	ADD	$8, X10
+	MOV	X31, (X11)
+	ADD	$8, X11
+
+	RET
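Entering duffcopy with n iterations remaining copies exactly n words: every unrolled block moves one 8-byte word from (X10) to (X11) through X31 (a scratch register on this port) and advances both pointers. A Go-level model of those semantics (a sketch, not part of the patch; duffcopyModel is a hypothetical name):

	// duffcopyModel mimics what entering runtime·duffcopy with n iterations
	// remaining does: copy n 8-byte words and advance both pointers.
	// Overlapping ranges are not handled, matching the real duffcopy.
	func duffcopyModel(dst, src []uint64, n int) {
		for i := 0; i < n; i++ {
			dst[i] = src[i] // MOV (X10), X31; MOV X31, (X11); both pointers += 8
		}
	}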
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index 8859ed68cc..6ddf0256e9 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -38,6 +38,7 @@ func main() {
 	gen("arm64", notags, zeroARM64, copyARM64)
 	gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
 	gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x)
+	gen("riscv64", notags, zeroRISCV64, copyRISCV64)
 }
 
 func gen(arch string, tags, zero, copy func(io.Writer)) {
@@ -227,3 +228,30 @@ func copyMIPS64x(w io.Writer) {
 	}
 	fmt.Fprintln(w, "\tRET")
 }
+
+func zeroRISCV64(w io.Writer) {
+	// ZERO: always zero
+	// X10: ptr to memory to be zeroed
+	// X10 is updated as a side effect.
+	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0")
+	for i := 0; i < 128; i++ {
+		fmt.Fprintln(w, "\tMOV\tZERO, (X10)")
+		fmt.Fprintln(w, "\tADD\t$8, X10")
+	}
+	fmt.Fprintln(w, "\tRET")
+}
+
+func copyRISCV64(w io.Writer) {
+	// X10: ptr to source memory
+	// X11: ptr to destination memory
+	// X10 and X11 are updated as a side effect
+	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0")
+	for i := 0; i < 128; i++ {
+		fmt.Fprintln(w, "\tMOV\t(X10), X31")
+		fmt.Fprintln(w, "\tADD\t$8, X10")
+		fmt.Fprintln(w, "\tMOV\tX31, (X11)")
+		fmt.Fprintln(w, "\tADD\t$8, X11")
+		fmt.Fprintln(w)
+	}
+	fmt.Fprintln(w, "\tRET")
+}
-- 
2.50.0
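As a cross-check of the generated file's shape, the 907-line figure in the diffstat can be reproduced from the two generator loops above (a sketch; the six-line header count is read off the top of duff_riscv64.s):

	package main

	import "fmt"

	func main() {
		const header = 6               // three comment lines, blank, #include, blank
		const duffzero = 1 + 128*2 + 1 // TEXT, 128 x (MOV + ADD), RET
		const duffcopy = 1 + 128*5 + 1 // TEXT, 128 x (4 instructions + blank), RET
		fmt.Println(header + duffzero + 1 + duffcopy) // 907, matching the diffstat
	}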