From: Meng Zhuo
Date: Thu, 28 Aug 2025 07:05:27 +0000 (+0000)
Subject: cmd/compile: use generated loops instead of DUFFZERO on riscv64
X-Git-Tag: go1.26rc1~920
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=879ff736d3;p=gostls13.git

cmd/compile: use generated loops instead of DUFFZERO on riscv64

name                     old B/s         new B/s          delta
MemclrKnownSize112-4     5.602Gi ± 0%    5.601Gi ±  0%          ~ (p=0.363 n=10)
MemclrKnownSize128-4     6.933Gi ± 1%    6.545Gi ±  1%     -5.59% (p=0.000 n=10)
MemclrKnownSize192-4     8.055Gi ± 1%    7.804Gi ±  0%     -3.12% (p=0.000 n=10)
MemclrKnownSize248-4     8.489Gi ± 0%    8.718Gi ±  0%     +2.69% (p=0.000 n=10)
MemclrKnownSize256-4     8.762Gi ± 0%    8.763Gi ±  0%          ~ (p=0.494 n=10)
MemclrKnownSize512-4     9.514Gi ± 1%    9.514Gi ±  0%          ~ (p=0.529 n=10)
MemclrKnownSize1024-4    9.940Gi ± 0%    9.939Gi ±  1%          ~ (p=0.989 n=10)
ClearFat3-4              1.300Gi ± 0%    1.301Gi ±  0%          ~ (p=0.447 n=10)
ClearFat4-4              3.902Gi ± 0%    3.902Gi ±  0%          ~ (p=0.971 n=10)
ClearFat5-4              665.8Mi ± 0%   1331.5Mi ±  0%   +100.01% (p=0.000 n=10)
ClearFat6-4              665.8Mi ± 0%   1330.5Mi ±  0%    +99.82% (p=0.000 n=10)
ClearFat7-4              490.7Mi ± 0%   1331.9Mi ±  0%   +171.45% (p=0.000 n=10)
ClearFat8-4              5.201Gi ± 0%    5.202Gi ±  0%          ~ (p=0.123 n=10)
ClearFat9-4              856.1Mi ± 0%   1331.6Mi ±  0%    +55.54% (p=0.000 n=10)
ClearFat10-4             887.8Mi ± 0%   1331.9Mi ±  0%    +50.03% (p=0.000 n=10)
ClearFat11-4             915.3Mi ± 0%   1331.1Mi ±  0%    +45.42% (p=0.000 n=10)
ClearFat12-4             5.202Gi ± 0%    5.202Gi ±  0%          ~ (p=0.481 n=10)
ClearFat13-4             961.5Mi ± 0%   1331.8Mi ±  0%    +38.50% (p=0.000 n=10)
ClearFat14-4             981.0Mi ± 0%   1331.8Mi ±  0%    +35.76% (p=0.000 n=10)
ClearFat15-4             951.3Mi ± 0%   1331.4Mi ±  0%    +39.96% (p=0.000 n=10)
ClearFat16-4             1.600Gi ± 0%    5.202Gi ±  0%   +225.10% (p=0.000 n=10)
ClearFat18-4             1.018Gi ± 0%    1.300Gi ±  0%    +27.77% (p=0.000 n=10)
ClearFat20-4             2.601Gi ± 0%    4.938Gi ± 12%    +89.87% (p=0.000 n=10)
ClearFat24-4             2.601Gi ± 0%    5.201Gi ±  0%    +99.96% (p=0.000 n=10)
ClearFat32-4             1.982Gi ± 0%    5.203Gi ±  0%   +162.55% (p=0.000 n=10)
ClearFat40-4             3.467Gi ± 0%    4.338Gi ±  0%    +25.11% (p=0.000 n=10)
ClearFat48-4             3.671Gi ± 0%    5.201Gi ±  0%    +41.69% (p=0.000 n=10)
ClearFat56-4             3.640Gi ± 0%    5.201Gi ±  0%    +42.88% (p=0.000 n=10)
ClearFat64-4             2.250Gi ± 0%    5.202Gi ±  0%   +131.25% (p=0.000 n=10)
ClearFat72-4             4.064Gi ± 0%    5.201Gi ±  0%    +27.97% (p=0.000 n=10)
ClearFat128-4            4.496Gi ± 0%    5.203Gi ±  0%    +15.71% (p=0.000 n=10)
ClearFat256-4            4.756Gi ± 0%    5.201Gi ±  0%     +9.36% (p=0.000 n=10)
ClearFat512-4            2.512Gi ± 0%    5.201Gi ±  0%   +107.03% (p=0.000 n=10)
ClearFat1024-4           4.255Gi ± 0%    5.202Gi ±  0%    +22.26% (p=0.000 n=10)
ClearFat1032-4           4.260Gi ± 0%    5.201Gi ±  0%    +22.09% (p=0.000 n=10)
ClearFat1040-4           4.285Gi ± 1%    5.203Gi ±  0%    +21.41% (p=0.000 n=10)
geomean                  2.005Gi         3.020Gi           +50.58%

Change-Id: Iea1da734ff8eaf1b5a2822ae2bdb7f4fd9b65651
Reviewed-on: https://go-review.googlesource.com/c/go/+/699635
Reviewed-by: Mark Ryan
Reviewed-by: Keith Randall
Reviewed-by: Keith Randall
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Mark Freeman
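For orientation, the rewrite rules in this CL choose between two lowerings by
size and alignment. A minimal Go sketch of that decision, not code from the
CL: zeroStrategy is a hypothetical name, and moveSize stands in for the
compiler helper of the same name, assuming power-of-two alignment:

	// Sketch of the lowering decision encoded in RISCV64.rules.
	func zeroStrategy(size, align int64) string {
		moveSize := min(align, 8) // widest store unit the alignment permits
		if size <= 24*moveSize {  // at most 192 bytes (3 cache lines) when 8-aligned
			return "LoweredZero: fully unrolled stores"
		}
		return "LoweredZeroLoop: 8-way unrolled loop plus tail stores"
	}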
---

diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go
index 88733b0d64..da28197490 100644
--- a/src/cmd/compile/internal/riscv64/ssa.go
+++ b/src/cmd/compile/internal/riscv64/ssa.go
@@ -181,6 +181,8 @@ func largestMove(alignment int64) (obj.As, int64) {
 	}
 }
 
+var fracMovOps = []obj.As{riscv.AMOVB, riscv.AMOVH, riscv.AMOVW, riscv.AMOV}
+
 // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
 // RISC-V has no flags, so this is a no-op.
 func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {}
@@ -738,30 +740,86 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.RegTo2 = riscv.REG_ZERO
 
 	case ssa.OpRISCV64LoweredZero:
-		mov, sz := largestMove(v.AuxInt)
+		ptr := v.Args[0].Reg()
+		sc := v.AuxValAndOff()
+		n := sc.Val64()
+
+		mov, sz := largestMove(sc.Off64())
+
+		// mov	ZERO, (offset)(Rarg0)
+		var off int64
+		for n >= sz {
+			zeroOp(s, mov, ptr, off)
+			off += sz
+			n -= sz
+		}
 
-		// mov	ZERO, (Rarg0)
-		// ADD	$sz, Rarg0
-		// BGEU	Rarg1, Rarg0, -2(PC)
+		for i := len(fracMovOps) - 1; i >= 0; i-- {
+			tsz := int64(1 << i)
+			if n < tsz {
+				continue
+			}
+			zeroOp(s, fracMovOps[i], ptr, off)
+			off += tsz
+			n -= tsz
+		}
 
-		p := s.Prog(mov)
-		p.From.Type = obj.TYPE_REG
-		p.From.Reg = riscv.REG_ZERO
-		p.To.Type = obj.TYPE_MEM
-		p.To.Reg = v.Args[0].Reg()
+	case ssa.OpRISCV64LoweredZeroLoop:
+		ptr := v.Args[0].Reg()
+		sc := v.AuxValAndOff()
+		n := sc.Val64()
+		mov, sz := largestMove(sc.Off64())
+		chunk := 8 * sz
+
+		if n <= 3*chunk {
+			v.Fatalf("ZeroLoop too small:%d, expect:%d", n, 3*chunk)
+		}
+
+		tmp := v.RegTmp()
+
+		p := s.Prog(riscv.AADD)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = n - n%chunk
+		p.Reg = ptr
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = tmp
+
+		for i := int64(0); i < 8; i++ {
+			zeroOp(s, mov, ptr, sz*i)
+		}
 
 		p2 := s.Prog(riscv.AADD)
 		p2.From.Type = obj.TYPE_CONST
-		p2.From.Offset = sz
+		p2.From.Offset = chunk
 		p2.To.Type = obj.TYPE_REG
-		p2.To.Reg = v.Args[0].Reg()
+		p2.To.Reg = ptr
 
-		p3 := s.Prog(riscv.ABGEU)
-		p3.To.Type = obj.TYPE_BRANCH
-		p3.Reg = v.Args[0].Reg()
+		p3 := s.Prog(riscv.ABNE)
+		p3.From.Reg = tmp
 		p3.From.Type = obj.TYPE_REG
-		p3.From.Reg = v.Args[1].Reg()
-		p3.To.SetTarget(p)
+		p3.Reg = ptr
+		p3.To.Type = obj.TYPE_BRANCH
+		p3.To.SetTarget(p.Link)
+
+		n %= chunk
+
+		// mov	ZERO, (offset)(Rarg0)
+		var off int64
+		for n >= sz {
+			zeroOp(s, mov, ptr, off)
+			off += sz
+			n -= sz
+		}
+
+		for i := len(fracMovOps) - 1; i >= 0; i-- {
+			tsz := int64(1 << i)
+			if n < tsz {
+				continue
+			}
+			zeroOp(s, fracMovOps[i], ptr, off)
+			off += tsz
+			n -= tsz
+		}
 
 	case ssa.OpRISCV64LoweredMove:
 		mov, sz := largestMove(v.AuxInt)
@@ -955,3 +1013,13 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
 	p.Pos = p.Pos.WithNotStmt()
 	return p
 }
+
+func zeroOp(s *ssagen.State, mov obj.As, reg int16, off int64) {
+	p := s.Prog(mov)
+	p.From.Type = obj.TYPE_REG
+	p.From.Reg = riscv.REG_ZERO
+	p.To.Type = obj.TYPE_MEM
+	p.To.Reg = reg
+	p.To.Offset = off
+	return
+}
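The two fracMovOps walks above share one job: after the widest stores are
emitted, finish the remaining bytes with each narrower power-of-two store at
most once, widest first. A standalone sketch of the (width, offset) pairs that
walk produces; tailStores is a hypothetical helper, not part of the CL:

	// Mirrors the fracMovOps loop: for a residual byte count n starting at
	// offset off, emit 8-, 4-, 2- and 1-byte stores, each at most once.
	func tailStores(n, off int64) (stores [][2]int64) {
		for width := int64(8); width >= 1; width >>= 1 {
			if n < width {
				continue
			}
			stores = append(stores, [2]int64{width, off})
			off += width
			n -= width
		}
		return stores
	}

For example, tailStores(7, 8) returns widths 4, 2 and 1 at offsets 8, 12 and
14, i.e. the MOVW/MOVH/MOVB tail that an 8-aligned 15-byte zero gets after its
single 8-byte store.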
diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules
index 821f822746..9382877795 100644
--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules
@@ -373,36 +373,14 @@
 	(MOVHstore [4] ptr (MOVDconst [0])
 		(MOVHstore [2] ptr (MOVDconst [0])
 			(MOVHstore ptr (MOVDconst [0]) mem)))
-(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
-	(MOVWstore [8] ptr (MOVDconst [0])
-		(MOVWstore [4] ptr (MOVDconst [0])
-			(MOVWstore ptr (MOVDconst [0]) mem)))
-(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
-	(MOVDstore [8] ptr (MOVDconst [0])
-		(MOVDstore ptr (MOVDconst [0]) mem))
-(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
-	(MOVDstore [16] ptr (MOVDconst [0])
-		(MOVDstore [8] ptr (MOVDconst [0])
-			(MOVDstore ptr (MOVDconst [0]) mem)))
-(Zero [32] {t} ptr mem) && t.Alignment()%8 == 0 =>
-	(MOVDstore [24] ptr (MOVDconst [0])
-		(MOVDstore [16] ptr (MOVDconst [0])
-			(MOVDstore [8] ptr (MOVDconst [0])
-				(MOVDstore ptr (MOVDconst [0]) mem))))
-
-// Medium 8-aligned zeroing uses a Duff's device
-// 8 and 128 are magic constants, see runtime/mkduff.go
-(Zero [s] {t} ptr mem)
-	&& s%8 == 0 && s <= 8*128
-	&& t.Alignment()%8 == 0 =>
-	(DUFFZERO [8 * (128 - s/8)] ptr mem)
+
+// Unroll zeroing in medium size (at most 192 bytes i.e. 3 cachelines)
+(Zero [s] {t} ptr mem) && s <= 24*moveSize(t.Alignment(), config) =>
+	(LoweredZero [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
 
 // Generic zeroing uses a loop
-(Zero [s] {t} ptr mem) =>
-	(LoweredZero [t.Alignment()]
-		ptr
-		(ADD ptr (MOVDconst [s-moveSize(t.Alignment(), config)]))
-		mem)
+(Zero [s] {t} ptr mem) && s > 24*moveSize(t.Alignment(), config) =>
+	(LoweredZeroLoop [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
 
 // Checks
 (IsNonNil ...) => (SNEZ ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go
index 0bccaf63bc..8e2f85b8d7 100644
--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go
@@ -317,25 +317,40 @@ func init() {
 
 		// Generic moves and zeros
 
-		// general unaligned zeroing
-		// arg0 = address of memory to zero (in X5, changed as side effect)
-		// arg1 = address of the last element to zero (inclusive)
-		// arg2 = mem
-		// auxint = element size
+		// general unrolled zeroing
+		// arg0 = address of memory to zero
+		// arg1 = mem
+		// auxint = element size and type alignment
 		// returns mem
-		//	mov	ZERO, (X5)
-		//	ADD	$sz, X5
-		//	BGEU	Rarg1, X5, -2(PC)
+		//	mov	ZERO, (OFFSET)(Rarg0)
 		{
-			name:      "LoweredZero",
-			aux:       "Int64",
-			argLength: 3,
+			name:           "LoweredZero",
+			aux:            "SymValAndOff",
+			typ:            "Mem",
+			argLength:      2,
+			symEffect:      "Write",
+			faultOnNilArg0: true,
 			reg: regInfo{
-				inputs:   []regMask{regNamed["X5"], gpMask},
-				clobbers: regNamed["X5"],
+				inputs: []regMask{gpMask},
 			},
+		},
+		// general unaligned zeroing
+		// arg0 = address of memory to zero (clobber)
+		// arg2 = mem
+		// auxint = element size and type alignment
+		// returns mem
+		{
+			name:           "LoweredZeroLoop",
+			aux:            "SymValAndOff",
 			typ:            "Mem",
+			argLength:      2,
+			symEffect:      "Write",
+			needIntTemp:    true,
 			faultOnNilArg0: true,
+			reg: regInfo{
+				inputs:       []regMask{gpMask},
+				clobbersArg0: true,
+			},
 		},
 
 		// general unaligned move
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 60ac188e1e..5f9572d675 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2569,6 +2569,7 @@ const (
 	OpRISCV64DUFFZERO
 	OpRISCV64DUFFCOPY
 	OpRISCV64LoweredZero
+	OpRISCV64LoweredZeroLoop
 	OpRISCV64LoweredMove
 	OpRISCV64LoweredAtomicLoad8
 	OpRISCV64LoweredAtomicLoad32
@@ -34558,15 +34559,28 @@ var opcodeTable = [...]opInfo{
 	},
 	{
 		name:           "LoweredZero",
-		auxType:        auxInt64,
-		argLen:         3,
+		auxType:        auxSymValAndOff,
+		argLen:         2,
 		faultOnNilArg0: true,
+		symEffect:      SymWrite,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 16},         // X5
-				{1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+				{0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+			},
+		},
+	},
+	{
+		name:           "LoweredZeroLoop",
+		auxType:        auxSymValAndOff,
+		argLen:         2,
+		needIntTemp:    true,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
 			},
-			clobbers: 16, // X5
+			clobbersArg0: true,
 		},
 	},
 	{
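Both new ops pack two numbers into a single auxint via the SymValAndOff aux
type: the byte count to zero (Val) and the type alignment (Off). A
self-contained sketch of that packing; the names mirror the real helpers in
cmd/compile/internal/ssa, but this is an illustration, not the compiler's
implementation:

	// Two int32 halves packed into one int64: size in the high 32 bits,
	// alignment in the low 32 bits.
	type valAndOff int64

	func makeValAndOff(val, off int32) valAndOff {
		return valAndOff(int64(val)<<32 | int64(uint32(off)))
	}

	func (x valAndOff) Val64() int64 { return int64(x) >> 32 }  // byte count
	func (x valAndOff) Off64() int64 { return int64(int32(x)) } // alignment

In ssaGenValue above, sc.Val64() recovers the size n and sc.Off64() feeds
largestMove to pick the widest store the alignment allows.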
diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go
index e2c400b0c5..faa465b9db 100644
--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go
+++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go
@@ -9925,138 +9925,39 @@ func rewriteValueRISCV64_OpZero(v *Value) bool {
 		v.AddArg3(ptr, v0, v1)
 		return true
 	}
-	// match: (Zero [12] {t} ptr mem)
-	// cond: t.Alignment()%4 == 0
-	// result: (MOVWstore [8] ptr (MOVDconst [0]) (MOVWstore [4] ptr (MOVDconst [0]) (MOVWstore ptr (MOVDconst [0]) mem)))
-	for {
-		if auxIntToInt64(v.AuxInt) != 12 {
-			break
-		}
-		t := auxToType(v.Aux)
-		ptr := v_0
-		mem := v_1
-		if !(t.Alignment()%4 == 0) {
-			break
-		}
-		v.reset(OpRISCV64MOVWstore)
-		v.AuxInt = int32ToAuxInt(8)
-		v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
-		v0.AuxInt = int64ToAuxInt(0)
-		v1 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
-		v1.AuxInt = int32ToAuxInt(4)
-		v2 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
-		v2.AddArg3(ptr, v0, mem)
-		v1.AddArg3(ptr, v0, v2)
-		v.AddArg3(ptr, v0, v1)
-		return true
-	}
-	// match: (Zero [16] {t} ptr mem)
-	// cond: t.Alignment()%8 == 0
-	// result: (MOVDstore [8] ptr (MOVDconst [0]) (MOVDstore ptr (MOVDconst [0]) mem))
-	for {
-		if auxIntToInt64(v.AuxInt) != 16 {
-			break
-		}
-		t := auxToType(v.Aux)
-		ptr := v_0
-		mem := v_1
-		if !(t.Alignment()%8 == 0) {
-			break
-		}
-		v.reset(OpRISCV64MOVDstore)
-		v.AuxInt = int32ToAuxInt(8)
-		v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
-		v0.AuxInt = int64ToAuxInt(0)
-		v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-		v1.AddArg3(ptr, v0, mem)
-		v.AddArg3(ptr, v0, v1)
-		return true
-	}
-	// match: (Zero [24] {t} ptr mem)
-	// cond: t.Alignment()%8 == 0
-	// result: (MOVDstore [16] ptr (MOVDconst [0]) (MOVDstore [8] ptr (MOVDconst [0]) (MOVDstore ptr (MOVDconst [0]) mem)))
-	for {
-		if auxIntToInt64(v.AuxInt) != 24 {
-			break
-		}
-		t := auxToType(v.Aux)
-		ptr := v_0
-		mem := v_1
-		if !(t.Alignment()%8 == 0) {
-			break
-		}
-		v.reset(OpRISCV64MOVDstore)
-		v.AuxInt = int32ToAuxInt(16)
-		v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
-		v0.AuxInt = int64ToAuxInt(0)
-		v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-		v1.AuxInt = int32ToAuxInt(8)
-		v2 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-		v2.AddArg3(ptr, v0, mem)
-		v1.AddArg3(ptr, v0, v2)
-		v.AddArg3(ptr, v0, v1)
-		return true
-	}
-	// match: (Zero [32] {t} ptr mem)
-	// cond: t.Alignment()%8 == 0
-	// result: (MOVDstore [24] ptr (MOVDconst [0]) (MOVDstore [16] ptr (MOVDconst [0]) (MOVDstore [8] ptr (MOVDconst [0]) (MOVDstore ptr (MOVDconst [0]) mem))))
-	for {
-		if auxIntToInt64(v.AuxInt) != 32 {
-			break
-		}
-		t := auxToType(v.Aux)
-		ptr := v_0
-		mem := v_1
-		if !(t.Alignment()%8 == 0) {
-			break
-		}
-		v.reset(OpRISCV64MOVDstore)
-		v.AuxInt = int32ToAuxInt(24)
-		v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
-		v0.AuxInt = int64ToAuxInt(0)
-		v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-		v1.AuxInt = int32ToAuxInt(16)
-		v2 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-		v2.AuxInt = int32ToAuxInt(8)
-		v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
-		v3.AddArg3(ptr, v0, mem)
-		v2.AddArg3(ptr, v0, v3)
-		v1.AddArg3(ptr, v0, v2)
-		v.AddArg3(ptr, v0, v1)
-		return true
-	}
 	// match: (Zero [s] {t} ptr mem)
-	// cond: s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0
-	// result: (DUFFZERO [8 * (128 - s/8)] ptr mem)
+	// cond: s <= 24*moveSize(t.Alignment(), config)
+	// result: (LoweredZero [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		t := auxToType(v.Aux)
 		ptr := v_0
 		mem := v_1
-		if !(s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0) {
+		if !(s <= 24*moveSize(t.Alignment(), config)) {
 			break
 		}
-		v.reset(OpRISCV64DUFFZERO)
-		v.AuxInt = int64ToAuxInt(8 * (128 - s/8))
+		v.reset(OpRISCV64LoweredZero)
+		v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment())))
 		v.AddArg2(ptr, mem)
 		return true
 	}
 	// match: (Zero [s] {t} ptr mem)
-	// result: (LoweredZero [t.Alignment()] ptr (ADD ptr (MOVDconst [s-moveSize(t.Alignment(), config)])) mem)
+	// cond: s > 24*moveSize(t.Alignment(), config)
+	// result: (LoweredZeroLoop [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		t := auxToType(v.Aux)
 		ptr := v_0
 		mem := v_1
-		v.reset(OpRISCV64LoweredZero)
-		v.AuxInt = int64ToAuxInt(t.Alignment())
-		v0 := b.NewValue0(v.Pos, OpRISCV64ADD, ptr.Type)
-		v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
-		v1.AuxInt = int64ToAuxInt(s - moveSize(t.Alignment(), config))
-		v0.AddArg2(ptr, v1)
-		v.AddArg3(ptr, v0, mem)
+		if !(s > 24*moveSize(t.Alignment(), config)) {
+			break
+		}
+		v.reset(OpRISCV64LoweredZeroLoop)
+		v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment())))
+		v.AddArg2(ptr, mem)
 		return true
 	}
+	return false
 }
 func rewriteBlockRISCV64(b *Block) bool {
 	typ := &b.Func.Config.Types
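The ClearFat numbers in the table come from benchmarks that repeatedly zero a
fixed-size value. A minimal self-contained sketch in that style (illustrative
only; not the benchmark file the table was produced with):

	package clearfat_test

	import "testing"

	// Heap-allocated so the compiler cannot prove the stores dead.
	var p = new([40]byte)

	func BenchmarkClearFat40(b *testing.B) {
		b.SetBytes(40) // report throughput, as in the table above
		for i := 0; i < b.N; i++ {
			*p = [40]byte{} // lowers to the unrolled zeroing sequence
		}
	}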