From 4dac9e093ff520af08f0b83f53b4fabac8db5321 Mon Sep 17 00:00:00 2001 From: Meng Zhuo Date: Wed, 3 Sep 2025 09:55:56 +0800 Subject: [PATCH] cmd/compile: use generated loops instead of DUFFCOPY on riscv64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit MemmoveKnownSize112-4 632.1Mi ± 1% 1288.5Mi ± 0% +103.85% (p=0.000 n=10) MemmoveKnownSize128-4 636.1Mi ± 0% 1280.9Mi ± 1% +101.36% (p=0.000 n=10) MemmoveKnownSize192-4 645.3Mi ± 0% 1306.9Mi ± 1% +102.53% (p=0.000 n=10) MemmoveKnownSize248-4 650.2Mi ± 2% 1312.5Mi ± 1% +101.87% (p=0.000 n=10) MemmoveKnownSize256-4 650.7Mi ± 0% 1303.6Mi ± 1% +100.33% (p=0.000 n=10) MemmoveKnownSize512-4 658.2Mi ± 1% 1293.9Mi ± 0% +96.60% (p=0.000 n=10) MemmoveKnownSize1024-4 662.1Mi ± 0% 1312.6Mi ± 0% +98.26% (p=0.000 n=10) Change-Id: I43681ca029880025558b33ddc4295da3947c9b28 Reviewed-on: https://go-review.googlesource.com/c/go/+/700537 LUCI-TryBot-Result: Go LUCI Reviewed-by: Keith Randall Reviewed-by: Keith Randall Reviewed-by: Mark Freeman --- src/cmd/compile/internal/riscv64/ssa.go | 139 ++++++++++++---- .../compile/internal/ssa/_gen/RISCV64.rules | 35 +--- .../compile/internal/ssa/_gen/RISCV64Ops.go | 53 ++++-- src/cmd/compile/internal/ssa/opGen.go | 30 +++- .../compile/internal/ssa/rewriteRISCV64.go | 153 ++---------------- 5 files changed, 187 insertions(+), 223 deletions(-) diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go index da28197490..43a1ec4b61 100644 --- a/src/cmd/compile/internal/riscv64/ssa.go +++ b/src/cmd/compile/internal/riscv64/ssa.go @@ -822,44 +822,99 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { } case ssa.OpRISCV64LoweredMove: - mov, sz := largestMove(v.AuxInt) + dst := v.Args[0].Reg() + src := v.Args[1].Reg() + if dst == src { + break + } - // mov (Rarg1), T2 - // mov T2, (Rarg0) - // ADD $sz, Rarg0 - // ADD $sz, Rarg1 - // BGEU Rarg2, Rarg0, -4(PC) + sa := v.AuxValAndOff() + n := sa.Val64() + mov, sz := largestMove(sa.Off64()) - p := s.Prog(mov) - p.From.Type = obj.TYPE_MEM - p.From.Reg = v.Args[1].Reg() + var off int64 + tmp := int16(riscv.REG_X5) + for n >= sz { + moveOp(s, mov, dst, src, tmp, off) + off += sz + n -= sz + } + + for i := len(fracMovOps) - 1; i >= 0; i-- { + tsz := int64(1 << i) + if n < tsz { + continue + } + moveOp(s, fracMovOps[i], dst, src, tmp, off) + off += tsz + n -= tsz + } + + case ssa.OpRISCV64LoweredMoveLoop: + dst := v.Args[0].Reg() + src := v.Args[1].Reg() + if dst == src { + break + } + + sc := v.AuxValAndOff() + n := sc.Val64() + mov, sz := largestMove(sc.Off64()) + chunk := 8 * sz + + if n <= 3*chunk { + v.Fatalf("MoveLoop too small:%d, expect:%d", n, 3*chunk) + } + tmp := int16(riscv.REG_X5) + + p := s.Prog(riscv.AADD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = n - n%chunk + p.Reg = src p.To.Type = obj.TYPE_REG - p.To.Reg = riscv.REG_T2 + p.To.Reg = riscv.REG_X6 - p2 := s.Prog(mov) - p2.From.Type = obj.TYPE_REG - p2.From.Reg = riscv.REG_T2 - p2.To.Type = obj.TYPE_MEM - p2.To.Reg = v.Args[0].Reg() - - p3 := s.Prog(riscv.AADD) - p3.From.Type = obj.TYPE_CONST - p3.From.Offset = sz - p3.To.Type = obj.TYPE_REG - p3.To.Reg = v.Args[0].Reg() - - p4 := s.Prog(riscv.AADD) - p4.From.Type = obj.TYPE_CONST - p4.From.Offset = sz - p4.To.Type = obj.TYPE_REG - p4.To.Reg = v.Args[1].Reg() + for i := int64(0); i < 8; i++ { + moveOp(s, mov, dst, src, tmp, sz*i) + } - p5 := s.Prog(riscv.ABGEU) - p5.To.Type = obj.TYPE_BRANCH - p5.Reg = v.Args[1].Reg() - p5.From.Type = obj.TYPE_REG - p5.From.Reg = v.Args[2].Reg() - 
p5.To.SetTarget(p) + p1 := s.Prog(riscv.AADD) + p1.From.Type = obj.TYPE_CONST + p1.From.Offset = chunk + p1.To.Type = obj.TYPE_REG + p1.To.Reg = src + + p2 := s.Prog(riscv.AADD) + p2.From.Type = obj.TYPE_CONST + p2.From.Offset = chunk + p2.To.Type = obj.TYPE_REG + p2.To.Reg = dst + + p3 := s.Prog(riscv.ABNE) + p3.From.Reg = riscv.REG_X6 + p3.From.Type = obj.TYPE_REG + p3.Reg = src + p3.To.Type = obj.TYPE_BRANCH + p3.To.SetTarget(p.Link) + + n %= chunk + + var off int64 + for n >= sz { + moveOp(s, mov, dst, src, tmp, off) + off += sz + n -= sz + } + + for i := len(fracMovOps) - 1; i >= 0; i-- { + tsz := int64(1 << i) + if n < tsz { + continue + } + moveOp(s, fracMovOps[i], dst, src, tmp, off) + off += tsz + n -= tsz + } case ssa.OpRISCV64LoweredNilCheck: // Issue a load which will fault if arg is nil. @@ -1023,3 +1078,21 @@ func zeroOp(s *ssagen.State, mov obj.As, reg int16, off int64) { p.To.Offset = off return } + +func moveOp(s *ssagen.State, mov obj.As, dst int16, src int16, tmp int16, off int64) { + p := s.Prog(mov) + p.From.Type = obj.TYPE_MEM + p.From.Reg = src + p.From.Offset = off + p.To.Type = obj.TYPE_REG + p.To.Reg = tmp + + p1 := s.Prog(mov) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = tmp + p1.To.Type = obj.TYPE_MEM + p1.To.Reg = dst + p1.To.Offset = off + + return +} diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules index 9382877795..e14de328ea 100644 --- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules @@ -442,37 +442,16 @@ (MOVHstore [4] dst (MOVHload [4] src mem) (MOVHstore [2] dst (MOVHload [2] src mem) (MOVHstore dst (MOVHload src mem) mem))) -(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 => - (MOVWstore [8] dst (MOVWload [8] src mem) - (MOVWstore [4] dst (MOVWload [4] src mem) - (MOVWstore dst (MOVWload src mem) mem))) -(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 => - (MOVDstore [8] dst (MOVDload [8] src mem) - (MOVDstore dst (MOVDload src mem) mem)) -(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 => - (MOVDstore [16] dst (MOVDload [16] src mem) - (MOVDstore [8] dst (MOVDload [8] src mem) - (MOVDstore dst (MOVDload src mem) mem))) -(Move [32] {t} dst src mem) && t.Alignment()%8 == 0 => - (MOVDstore [24] dst (MOVDload [24] src mem) - (MOVDstore [16] dst (MOVDload [16] src mem) - (MOVDstore [8] dst (MOVDload [8] src mem) - (MOVDstore dst (MOVDload src mem) mem)))) - -// Medium 8-aligned move uses a Duff's device -// 16 and 128 are magic constants, see runtime/mkduff.go -(Move [s] {t} dst src mem) - && s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 + +// Generic move +(Move [s] {t} dst src mem) && s > 0 && s <= 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s) => - (DUFFCOPY [16 * (128 - s/8)] dst src mem) + (LoweredMove [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem) // Generic move uses a loop -(Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) => - (LoweredMove [t.Alignment()] - dst - src - (ADDI [s-moveSize(t.Alignment(), config)] src) - mem) +(Move [s] {t} dst src mem) && s > 3*8*moveSize(t.Alignment(), config) + && logLargeCopy(v, s) => + (LoweredMoveLoop [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem) // Boolean ops; 0=false, 1=true (AndB ...) => (AND ...) 
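To make the new rule thresholds concrete: moveSize picks the widest load/store the type's alignment allows, so the unrolled LoweredMove form covers at most 3*8 elements, i.e. 192 bytes for 8-byte-aligned types, and anything larger becomes LoweredMoveLoop. The standalone sketch below illustrates that cutoff; moveSizeFor is a hypothetical stand-in for the compiler's moveSize helper (assumed here to map alignment to an 8/4/2/1-byte element on a 64-bit target) and is not part of this patch.

package main

import "fmt"

// moveSizeFor is a hypothetical stand-in for the compiler's moveSize helper:
// it picks the widest load/store size (in bytes) that the given type
// alignment allows on a 64-bit target.
func moveSizeFor(align int64) int64 {
	switch {
	case align%8 == 0:
		return 8
	case align%4 == 0:
		return 4
	case align%2 == 0:
		return 2
	}
	return 1
}

func main() {
	for _, align := range []int64{1, 2, 4, 8} {
		sz := moveSizeFor(align)
		// Per the two rules above: sizes up to 3*8*sz bytes are fully
		// unrolled (LoweredMove); larger sizes use LoweredMoveLoop.
		fmt.Printf("align %d: unroll up to %3d bytes, loop beyond that\n", align, 3*8*sz)
	}
}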
diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go index 8e2f85b8d7..6507a22708 100644 --- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go @@ -117,6 +117,7 @@ func init() { regCtxt := regNamed["X26"] callerSave := gpMask | fpMask | regNamed["g"] + r5toR6 := regNamed["X5"] | regNamed["X6"] var ( gpstore = regInfo{inputs: []regMask{gpspsbMask, gpspMask, 0}} // SB in first input so we can load from a global, but not in second to avoid using SB as a temporary register @@ -354,27 +355,51 @@ func init() { }, // general unaligned move - // arg0 = address of dst memory (in X5, changed as side effect) - // arg1 = address of src memory (in X6, changed as side effect) - // arg2 = address of the last element of src (can't be X7 as we clobber it before using arg2) + // arg0 = address of dst memory (clobber) + // arg1 = address of src memory (clobber) + // arg2 = mem + // auxint = size and type alignment + // returns mem + // mov (offset)(Rarg1), TMP + // mov TMP, (offset)(Rarg0) + { + name: "LoweredMove", + aux: "SymValAndOff", + symEffect: "Write", + argLength: 3, + reg: regInfo{ + inputs: []regMask{gpMask &^ regNamed["X5"], gpMask &^ regNamed["X5"]}, + clobbers: regNamed["X5"], + }, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // general unaligned move + // arg0 = address of dst memory (clobber) + // arg1 = address of src memory (clobber) // arg3 = mem // auxint = alignment - // clobbers X7 as a tmp register. // returns mem - // mov (X6), X7 - // mov X7, (X5) - // ADD $sz, X5 // ADD $sz, X6 - // BGEU Rarg2, X5, -4(PC) + //loop: + // mov (Rarg1), X5 + // mov X5, (Rarg0) + // ...rest 7 mov... + // ADD $sz, Rarg0 + // ADD $sz, Rarg1 + // BNE X6, Rarg1, loop { - name: "LoweredMove", - aux: "Int64", - argLength: 4, + name: "LoweredMoveLoop", + aux: "SymValAndOff", + argLength: 3, + symEffect: "Write", reg: regInfo{ - inputs: []regMask{regNamed["X5"], regNamed["X6"], gpMask &^ regNamed["X7"]}, - clobbers: regNamed["X5"] | regNamed["X6"] | regNamed["X7"], + inputs: []regMask{gpMask &^ r5toR6, gpMask &^ r5toR6}, + clobbers: r5toR6, + clobbersArg0: true, + clobbersArg1: true, }, - typ: "Mem", faultOnNilArg0: true, faultOnNilArg1: true, }, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 5f9572d675..592e99f327 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2571,6 +2571,7 @@ const ( OpRISCV64LoweredZero OpRISCV64LoweredZeroLoop OpRISCV64LoweredMove + OpRISCV64LoweredMoveLoop OpRISCV64LoweredAtomicLoad8 OpRISCV64LoweredAtomicLoad32 OpRISCV64LoweredAtomicLoad64 @@ -34585,17 +34586,34 @@ var opcodeTable = [...]opInfo{ }, { name: "LoweredMove", - auxType: auxInt64, - argLen: 4, + auxType: auxSymValAndOff, + argLen: 3, faultOnNilArg0: true, faultOnNilArg1: true, + symEffect: SymWrite, reg: regInfo{ inputs: []inputInfo{ - {0, 16}, // X5 - {1, 32}, // X6 - {2, 1006632880}, // X5 X6 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 + {0, 1006632928}, // X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 + {1, 1006632928}, // X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 }, - clobbers: 112, // X5 X6 X7 + clobbers: 16, // X5 + }, + }, + { + name: "LoweredMoveLoop", + auxType: auxSymValAndOff, + argLen: 3, + faultOnNilArg0: true, + faultOnNilArg1: true, + symEffect: SymWrite, + 
reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632896}, // X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 + {1, 1006632896}, // X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 + }, + clobbers: 48, // X5 X6 + clobbersArg0: true, + clobbersArg1: true, }, }, { diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go index faa465b9db..5723327bc9 100644 --- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go +++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go @@ -3090,169 +3090,38 @@ func rewriteValueRISCV64_OpMove(v *Value) bool { v.AddArg3(dst, v0, v1) return true } - // match: (Move [12] {t} dst src mem) - // cond: t.Alignment()%4 == 0 - // result: (MOVWstore [8] dst (MOVWload [8] src mem) (MOVWstore [4] dst (MOVWload [4] src mem) (MOVWstore dst (MOVWload src mem) mem))) - for { - if auxIntToInt64(v.AuxInt) != 12 { - break - } - t := auxToType(v.Aux) - dst := v_0 - src := v_1 - mem := v_2 - if !(t.Alignment()%4 == 0) { - break - } - v.reset(OpRISCV64MOVWstore) - v.AuxInt = int32ToAuxInt(8) - v0 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32) - v0.AuxInt = int32ToAuxInt(8) - v0.AddArg2(src, mem) - v1 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem) - v1.AuxInt = int32ToAuxInt(4) - v2 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32) - v2.AuxInt = int32ToAuxInt(4) - v2.AddArg2(src, mem) - v3 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem) - v4 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32) - v4.AddArg2(src, mem) - v3.AddArg3(dst, v4, mem) - v1.AddArg3(dst, v2, v3) - v.AddArg3(dst, v0, v1) - return true - } - // match: (Move [16] {t} dst src mem) - // cond: t.Alignment()%8 == 0 - // result: (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem)) - for { - if auxIntToInt64(v.AuxInt) != 16 { - break - } - t := auxToType(v.Aux) - dst := v_0 - src := v_1 - mem := v_2 - if !(t.Alignment()%8 == 0) { - break - } - v.reset(OpRISCV64MOVDstore) - v.AuxInt = int32ToAuxInt(8) - v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64) - v0.AuxInt = int32ToAuxInt(8) - v0.AddArg2(src, mem) - v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem) - v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64) - v2.AddArg2(src, mem) - v1.AddArg3(dst, v2, mem) - v.AddArg3(dst, v0, v1) - return true - } - // match: (Move [24] {t} dst src mem) - // cond: t.Alignment()%8 == 0 - // result: (MOVDstore [16] dst (MOVDload [16] src mem) (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem))) - for { - if auxIntToInt64(v.AuxInt) != 24 { - break - } - t := auxToType(v.Aux) - dst := v_0 - src := v_1 - mem := v_2 - if !(t.Alignment()%8 == 0) { - break - } - v.reset(OpRISCV64MOVDstore) - v.AuxInt = int32ToAuxInt(16) - v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64) - v0.AuxInt = int32ToAuxInt(16) - v0.AddArg2(src, mem) - v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem) - v1.AuxInt = int32ToAuxInt(8) - v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64) - v2.AuxInt = int32ToAuxInt(8) - v2.AddArg2(src, mem) - v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem) - v4 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64) - v4.AddArg2(src, mem) - v3.AddArg3(dst, v4, mem) - v1.AddArg3(dst, v2, v3) - v.AddArg3(dst, v0, v1) - return true - } - // match: (Move [32] {t} dst src mem) - // cond: t.Alignment()%8 == 0 - // result: (MOVDstore [24] dst (MOVDload [24] src mem) (MOVDstore [16] dst (MOVDload 
[16] src mem) (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem)))) - for { - if auxIntToInt64(v.AuxInt) != 32 { - break - } - t := auxToType(v.Aux) - dst := v_0 - src := v_1 - mem := v_2 - if !(t.Alignment()%8 == 0) { - break - } - v.reset(OpRISCV64MOVDstore) - v.AuxInt = int32ToAuxInt(24) - v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64) - v0.AuxInt = int32ToAuxInt(24) - v0.AddArg2(src, mem) - v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem) - v1.AuxInt = int32ToAuxInt(16) - v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64) - v2.AuxInt = int32ToAuxInt(16) - v2.AddArg2(src, mem) - v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem) - v3.AuxInt = int32ToAuxInt(8) - v4 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64) - v4.AuxInt = int32ToAuxInt(8) - v4.AddArg2(src, mem) - v5 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem) - v6 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64) - v6.AddArg2(src, mem) - v5.AddArg3(dst, v6, mem) - v3.AddArg3(dst, v4, v5) - v1.AddArg3(dst, v2, v3) - v.AddArg3(dst, v0, v1) - return true - } // match: (Move [s] {t} dst src mem) - // cond: s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && logLargeCopy(v, s) - // result: (DUFFCOPY [16 * (128 - s/8)] dst src mem) + // cond: s > 0 && s <= 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s) + // result: (LoweredMove [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem) for { s := auxIntToInt64(v.AuxInt) t := auxToType(v.Aux) dst := v_0 src := v_1 mem := v_2 - if !(s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && logLargeCopy(v, s)) { + if !(s > 0 && s <= 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)) { break } - v.reset(OpRISCV64DUFFCOPY) - v.AuxInt = int64ToAuxInt(16 * (128 - s/8)) + v.reset(OpRISCV64LoweredMove) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment()))) v.AddArg3(dst, src, mem) return true } // match: (Move [s] {t} dst src mem) - // cond: (s <= 16 || logLargeCopy(v, s)) - // result: (LoweredMove [t.Alignment()] dst src (ADDI [s-moveSize(t.Alignment(), config)] src) mem) + // cond: s > 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s) + // result: (LoweredMoveLoop [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem) for { s := auxIntToInt64(v.AuxInt) t := auxToType(v.Aux) dst := v_0 src := v_1 mem := v_2 - if !(s <= 16 || logLargeCopy(v, s)) { + if !(s > 3*8*moveSize(t.Alignment(), config) && logLargeCopy(v, s)) { break } - v.reset(OpRISCV64LoweredMove) - v.AuxInt = int64ToAuxInt(t.Alignment()) - v0 := b.NewValue0(v.Pos, OpRISCV64ADDI, src.Type) - v0.AuxInt = int64ToAuxInt(s - moveSize(t.Alignment(), config)) - v0.AddArg(src) - v.AddArg4(dst, src, v0, mem) + v.reset(OpRISCV64LoweredMoveLoop) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment()))) + v.AddArg3(dst, src, mem) return true } return false -- 2.52.0
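For readers who want to see the shape of the code the new lowering emits, here is a small Go sketch that prints the pseudo-instruction sequence for LoweredMoveLoop: an 8-way unrolled inner loop over full chunks, followed by a largest-to-smallest tail, mirroring the moveOp/fracMovOps logic in ssa.go above. The function name emitMoveLoop is illustrative and not part of the patch, and "MOV" stands in for the size-specific MOVB/MOVH/MOVW/MOV instructions chosen by largestMove and fracMovOps.

package main

import "fmt"

// emitMoveLoop prints a pseudo-instruction sketch of what the new
// LoweredMoveLoop lowering generates for an n-byte move whose alignment
// allows elemSize-byte loads/stores. This is an illustration of the code
// shape, not the actual ssagen emission code.
func emitMoveLoop(n, elemSize int64) {
	chunk := 8 * elemSize
	fmt.Printf("ADD  $%d, src, X6        // X6 = src + bytes handled by the loop\n", n-n%chunk)
	fmt.Println("loop:")
	for i := int64(0); i < 8; i++ {
		fmt.Printf("  MOV  %d(src), X5; MOV  X5, %d(dst)\n", i*elemSize, i*elemSize)
	}
	fmt.Printf("  ADD  $%d, src; ADD  $%d, dst\n", chunk, chunk)
	fmt.Println("  BNE  X6, src, loop")

	// Tail: offsets restart at 0 because src and dst were advanced past the
	// looped region; the remainder is drained with progressively smaller
	// moves, the same largest-to-smallest strategy the unrolled LoweredMove
	// case uses.
	off, rem := int64(0), n%chunk
	for sz := elemSize; sz >= 1; sz /= 2 {
		for rem >= sz {
			fmt.Printf("MOV  %d(src), X5; MOV  X5, %d(dst)   // %d-byte tail move\n", off, off, sz)
			off += sz
			rem -= sz
		}
	}
}

func main() {
	// 200 bytes of an 8-byte-aligned type: just over the 3*8*8 = 192-byte
	// unroll threshold, so the rules select LoweredMoveLoop.
	emitMoveLoop(200, 8)
}

The BNE-terminated loop plus size-specific tail is what replaces the fixed DUFFCOPY jump table; the two columns of MemmoveKnownSize figures in the commit message compare throughput before (DUFFCOPY) and after (generated loops) this change.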