From: Keith Randall
Date: Thu, 5 Jun 2025 00:14:01 +0000 (-0700)
Subject: cmd/compile: use generated loops instead of DUFFZERO on arm64
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=28aa529c998f114f08ac57cfe94fbf3dd7813f00;p=gostls13.git

cmd/compile: use generated loops instead of DUFFZERO on arm64

Change-Id: Ie0c8263f36d1bcfd0edfc4ea6710ae6c113c4d48
Reviewed-on: https://go-review.googlesource.com/c/go/+/678995
Reviewed-by: Keith Randall
Reviewed-by: Jorropo
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Michael Knyszek
---

diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index be7887318a..cd0c2cdfaa 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -1050,33 +1050,118 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.From.Offset = int64(condCode)
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
-	case ssa.OpARM64DUFFZERO:
-		// runtime.duffzero expects start address in R20
-		p := s.Prog(obj.ADUFFZERO)
-		p.To.Type = obj.TYPE_MEM
-		p.To.Name = obj.NAME_EXTERN
-		p.To.Sym = ir.Syms.Duffzero
-		p.To.Offset = v.AuxInt
 	case ssa.OpARM64LoweredZero:
-		// STP.P (ZR,ZR), 16(R16)
-		// CMP Rarg1, R16
-		// BLE -2(PC)
-		// arg1 is the address of the last 16-byte unit to zero
-		p := s.Prog(arm64.ASTP)
-		p.Scond = arm64.C_XPOST
-		p.From.Type = obj.TYPE_REGREG
-		p.From.Reg = arm64.REGZERO
-		p.From.Offset = int64(arm64.REGZERO)
-		p.To.Type = obj.TYPE_MEM
-		p.To.Reg = arm64.REG_R16
-		p.To.Offset = 16
-		p2 := s.Prog(arm64.ACMP)
-		p2.From.Type = obj.TYPE_REG
-		p2.From.Reg = v.Args[1].Reg()
-		p2.Reg = arm64.REG_R16
-		p3 := s.Prog(arm64.ABLE)
-		p3.To.Type = obj.TYPE_BRANCH
-		p3.To.SetTarget(p)
+		ptrReg := v.Args[0].Reg()
+		n := v.AuxInt
+		if n < 16 {
+			v.Fatalf("Zero too small %d", n)
+		}
+
+		// Generate zeroing instructions.
+		var off int64
+		for n >= 16 {
+			// STP (ZR, ZR), off(ptrReg)
+			zero16(s, ptrReg, off, false)
+			off += 16
+			n -= 16
+		}
+		// Write any fractional portion.
+		// An overlapping 16-byte write can't be used here
+		// because STP's offsets must be a multiple of 8.
+		if n > 8 {
+			// MOVD ZR, off(ptrReg)
+			zero8(s, ptrReg, off)
+			off += 8
+			n -= 8
+		}
+		if n != 0 {
+			// MOVD ZR, off+n-8(ptrReg)
+			// TODO: for n<=4 we could use a smaller write.
+			zero8(s, ptrReg, off+n-8)
+		}
+	case ssa.OpARM64LoweredZeroLoop:
+		ptrReg := v.Args[0].Reg()
+		countReg := v.RegTmp()
+		n := v.AuxInt
+		loopSize := int64(64)
+		if n < 3*loopSize {
+			// - a loop count of 0 won't work.
+			// - a loop count of 1 is useless.
+			// - a loop count of 2 is a code size ~tie
+			//   3 instructions to implement the loop
+			//   4 instructions in the loop body
+			//   vs
+			//   8 instructions in the straightline code
+			//   Might as well use straightline code.
+			v.Fatalf("ZeroLoop size too small %d", n)
+		}
+
+		// Put iteration count in a register.
+		// MOVD $n, countReg
+		p := s.Prog(arm64.AMOVD)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = n / loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		cntInit := p
+
+		// Zero loopSize bytes starting at ptrReg.
+		// Increment ptrReg by loopSize as a side effect.
+		for range loopSize / 16 {
+			// STP.P (ZR, ZR), 16(ptrReg)
+			zero16(s, ptrReg, 0, true)
+			// TODO: should we use the postincrement form,
+			// or use a separate += 64 instruction?
+			// postincrement saves an instruction, but maybe
+			// it requires more integer units to do the +=16s.
+		}
+		// Decrement loop count.
+		// SUB $1, countReg
+		p = s.Prog(arm64.ASUB)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		// Jump to loop header if we're not done yet.
+		// CBNZ head
+		p = s.Prog(arm64.ACBNZ)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = countReg
+		p.To.Type = obj.TYPE_BRANCH
+		p.To.SetTarget(cntInit.Link)
+
+		// Multiples of the loop size are now done.
+		n %= loopSize
+
+		// Write any fractional portion.
+		var off int64
+		for n >= 16 {
+			// STP (ZR, ZR), off(ptrReg)
+			zero16(s, ptrReg, off, false)
+			off += 16
+			n -= 16
+		}
+		if n > 8 {
+			// Note: an overlapping 16-byte write can't be used
+			// here because STP's offsets must be a multiple of 8.
+			// MOVD ZR, off(ptrReg)
+			zero8(s, ptrReg, off)
+			off += 8
+			n -= 8
+		}
+		if n != 0 {
+			// MOVD ZR, off+n-8(ptrReg)
+			// TODO: for n<=4 we could use a smaller write.
+			zero8(s, ptrReg, off+n-8)
+		}
+		// TODO: maybe we should use the count register to instead
+		// hold an end pointer and compare against that?
+		// ADD $n, ptrReg, endReg
+		// then
+		// CMP ptrReg, endReg
+		// BNE loop
+		// There's a past-the-end pointer here, any problem with that?
+
 	case ssa.OpARM64DUFFCOPY:
 		p := s.Prog(obj.ADUFFCOPY)
 		p.To.Type = obj.TYPE_MEM
@@ -1482,3 +1567,35 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
 	p.Pos = p.Pos.WithNotStmt()
 	return p
 }
+
+// zero16 zeroes 16 bytes at reg+off.
+// If postInc is true, increment reg by 16.
+func zero16(s *ssagen.State, reg int16, off int64, postInc bool) {
+	// STP (ZR, ZR), off(reg)
+	p := s.Prog(arm64.ASTP)
+	p.From.Type = obj.TYPE_REGREG
+	p.From.Reg = arm64.REGZERO
+	p.From.Offset = int64(arm64.REGZERO)
+	p.To.Type = obj.TYPE_MEM
+	p.To.Reg = reg
+	p.To.Offset = off
+	if postInc {
+		if off != 0 {
+			panic("can't postinc with non-zero offset")
+		}
+		// STP.P (ZR, ZR), 16(reg)
+		p.Scond = arm64.C_XPOST
+		p.To.Offset = 16
+	}
+}
+
+// zero8 zeroes 8 bytes at reg+off.
+func zero8(s *ssagen.State, reg int16, off int64) {
+	// MOVD ZR, off(reg)
+	p := s.Prog(arm64.AMOVD)
+	p.From.Type = obj.TYPE_REG
+	p.From.Reg = arm64.REGZERO
+	p.To.Type = obj.TYPE_MEM
+	p.To.Reg = reg
+	p.To.Offset = off
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
index 15ba10e216..197db974b2 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
@@ -392,44 +392,8 @@
 (Zero [16] ptr mem) =>
 	(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)
 
-(Zero [32] ptr mem) =>
-	(STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
-		(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
-
-(Zero [48] ptr mem) =>
-	(STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
-		(STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
-			(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
-
-(Zero [64] ptr mem) =>
-	(STP [48] ptr (MOVDconst [0]) (MOVDconst [0])
-		(STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
-			(STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
-				(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
-
-// strip off fractional word zeroing
-(Zero [s] ptr mem) && s%16 != 0 && s%16 <= 8 && s > 16 =>
-	(Zero [8]
-		(OffPtr ptr [s-8])
-		(Zero [s-s%16] ptr mem))
-(Zero [s] ptr mem) && s%16 != 0 && s%16 > 8 && s > 16 =>
-	(Zero [16]
-		(OffPtr ptr [s-16])
-		(Zero [s-s%16] ptr mem))
-
-// medium zeroing uses a duff device
-// 4, 16, and 64 are magic constants, see runtime/mkduff.go
-(Zero [s] ptr mem)
-	&& s%16 == 0 && s > 64 && s <= 16*64 =>
-	(DUFFZERO [4 * (64 - s/16)] ptr mem)
-
-// large zeroing uses a loop
-(Zero [s] ptr mem)
-	&& s%16 == 0 && s > 16*64 =>
-	(LoweredZero
-		ptr
-		(ADDconst [s-16] ptr)
-		mem)
+(Zero [s] ptr mem) && s > 16 && s < 192 => (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s >= 192 => (LoweredZeroLoop [s] ptr mem)
 
 // moves
 (Move [0] _ _ mem) => mem
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
index 69db139ff0..072cc2f4c8 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
@@ -536,44 +536,36 @@ func init() {
 		{name: "LessThanNoov", argLength: 1, reg: readflags}, // bool, true flags encode signed x=y but without honoring overflow, false otherwise.
 
-		// duffzero
+		// medium zeroing
 		// arg0 = address of memory to zero
 		// arg1 = mem
-		// auxint = offset into duffzero code to start executing
+		// auxint = # of bytes to zero
 		// returns mem
-		// R20 changed as side effect
-		// R16 and R17 may be clobbered by linker trampoline.
 		{
-			name:      "DUFFZERO",
+			name:      "LoweredZero",
 			aux:       "Int64",
 			argLength: 2,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R20")},
-				clobbers: buildReg("R16 R17 R20 R30"),
+				inputs: []regMask{gp},
 			},
-			//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
-			unsafePoint: true, // FP maintenance around DUFFZERO can be clobbered by interrupts
+			faultOnNilArg0: true,
 		},
 
 		// large zeroing
-		// arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
-		// arg1 = address of the last 16-byte unit to zero
-		// arg2 = mem
+		// arg0 = address of memory to zero
+		// arg1 = mem
+		// auxint = # of bytes to zero
 		// returns mem
-		// STP.P (ZR,ZR), 16(R16)
-		// CMP Rarg1, R16
-		// BLE -2(PC)
-		// Note: the-end-of-the-memory may be not a valid pointer. it's a problem if it is spilled.
-		// the-end-of-the-memory - 16 is with the area to zero, ok to spill.
 		{
-			name:      "LoweredZero",
-			argLength: 3,
+			name:      "LoweredZeroLoop",
+			aux:       "Int64",
+			argLength: 2,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R16"), gp},
-				clobbers: buildReg("R16"),
+				inputs:       []regMask{gp},
+				clobbersArg0: true,
 			},
-			clobberFlags:   true,
 			faultOnNilArg0: true,
+			needIntTemp:    true,
 		},
 
 		// duffcopy
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index d33933e0d8..95f8d48a61 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1718,8 +1718,8 @@ const (
 	OpARM64NotGreaterEqualF
 	OpARM64LessThanNoov
 	OpARM64GreaterEqualNoov
-	OpARM64DUFFZERO
 	OpARM64LoweredZero
+	OpARM64LoweredZeroLoop
 	OpARM64DUFFCOPY
 	OpARM64LoweredMove
 	OpARM64LoweredGetClosurePtr
@@ -23069,28 +23069,27 @@ var opcodeTable = [...]opInfo{
 		},
 	},
 	{
-		name:        "DUFFZERO",
-		auxType:     auxInt64,
-		argLen:      2,
-		unsafePoint: true,
+		name:           "LoweredZero",
+		auxType:        auxInt64,
+		argLen:         2,
+		faultOnNilArg0: true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 524288}, // R20
+				{0, 335544319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
 			},
-			clobbers: 269156352, // R16 R17 R20 R30
 		},
 	},
 	{
-		name:         "LoweredZero",
-		argLen:       3,
-		clobberFlags: true,
+		name:           "LoweredZeroLoop",
+		auxType:        auxInt64,
+		argLen:         2,
+		needIntTemp:    true,
 		faultOnNilArg0: true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 65536}, // R16
-				{1, 335544319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+				{0, 335544319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
 			},
-			clobbers: 65536, // R16
+			clobbersArg0: true,
 		},
 	},
 	{
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index 32f0f55434..59d6fe64db 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -22321,141 +22321,34 @@ func rewriteValueARM64_OpZero(v *Value) bool {
 		v.AddArg4(ptr, v0, v0, mem)
 		return true
 	}
-	// match: (Zero [32] ptr mem)
-	// result: (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
-	for {
-		if auxIntToInt64(v.AuxInt) != 32 {
-			break
-		}
-		ptr := v_0
-		mem := v_1
-		v.reset(OpARM64STP)
-		v.AuxInt = int32ToAuxInt(16)
-		v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
-		v0.AuxInt = int64ToAuxInt(0)
-		v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-		v1.AuxInt = int32ToAuxInt(0)
-		v1.AddArg4(ptr, v0, v0, mem)
-		v.AddArg4(ptr, v0, v0, v1)
-		return true
-	}
-	// match: (Zero [48] ptr mem)
-	// result: (STP [32] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
-	for {
-		if auxIntToInt64(v.AuxInt) != 48 {
-			break
-		}
-		ptr := v_0
-		mem := v_1
-		v.reset(OpARM64STP)
-		v.AuxInt = int32ToAuxInt(32)
-		v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
-		v0.AuxInt = int64ToAuxInt(0)
-		v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-		v1.AuxInt = int32ToAuxInt(16)
-		v2 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-		v2.AuxInt = int32ToAuxInt(0)
-		v2.AddArg4(ptr, v0, v0, mem)
-		v1.AddArg4(ptr, v0, v0, v2)
-		v.AddArg4(ptr, v0, v0, v1)
-		return true
-	}
-	// match: (Zero [64] ptr mem)
-	// result: (STP [48] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [32] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
-	for {
-		if auxIntToInt64(v.AuxInt) != 64 {
-			break
-		}
-		ptr := v_0
-		mem := v_1
-		v.reset(OpARM64STP)
-		v.AuxInt = int32ToAuxInt(48)
-		v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
-		v0.AuxInt = int64ToAuxInt(0)
-		v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-		v1.AuxInt = int32ToAuxInt(32)
-		v2 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-		v2.AuxInt = int32ToAuxInt(16)
-		v3 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-		v3.AuxInt = int32ToAuxInt(0)
-		v3.AddArg4(ptr, v0, v0, mem)
-		v2.AddArg4(ptr, v0, v0, v3)
-		v1.AddArg4(ptr, v0, v0, v2)
-		v.AddArg4(ptr, v0, v0, v1)
-		return true
-	}
-	// match: (Zero [s] ptr mem)
-	// cond: s%16 != 0 && s%16 <= 8 && s > 16
-	// result: (Zero [8] (OffPtr ptr [s-8]) (Zero [s-s%16] ptr mem))
-	for {
-		s := auxIntToInt64(v.AuxInt)
-		ptr := v_0
-		mem := v_1
-		if !(s%16 != 0 && s%16 <= 8 && s > 16) {
-			break
-		}
-		v.reset(OpZero)
-		v.AuxInt = int64ToAuxInt(8)
-		v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
-		v0.AuxInt = int64ToAuxInt(s - 8)
-		v0.AddArg(ptr)
-		v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
-		v1.AuxInt = int64ToAuxInt(s - s%16)
-		v1.AddArg2(ptr, mem)
-		v.AddArg2(v0, v1)
-		return true
-	}
-	// match: (Zero [s] ptr mem)
-	// cond: s%16 != 0 && s%16 > 8 && s > 16
-	// result: (Zero [16] (OffPtr ptr [s-16]) (Zero [s-s%16] ptr mem))
-	for {
-		s := auxIntToInt64(v.AuxInt)
-		ptr := v_0
-		mem := v_1
-		if !(s%16 != 0 && s%16 > 8 && s > 16) {
-			break
-		}
-		v.reset(OpZero)
-		v.AuxInt = int64ToAuxInt(16)
-		v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
-		v0.AuxInt = int64ToAuxInt(s - 16)
-		v0.AddArg(ptr)
-		v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
-		v1.AuxInt = int64ToAuxInt(s - s%16)
-		v1.AddArg2(ptr, mem)
-		v.AddArg2(v0, v1)
-		return true
-	}
 	// match: (Zero [s] ptr mem)
-	// cond: s%16 == 0 && s > 64 && s <= 16*64
-	// result: (DUFFZERO [4 * (64 - s/16)] ptr mem)
+	// cond: s > 16 && s < 192
+	// result: (LoweredZero [s] ptr mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		ptr := v_0
 		mem := v_1
-		if !(s%16 == 0 && s > 64 && s <= 16*64) {
+		if !(s > 16 && s < 192) {
 			break
 		}
-		v.reset(OpARM64DUFFZERO)
-		v.AuxInt = int64ToAuxInt(4 * (64 - s/16))
+		v.reset(OpARM64LoweredZero)
+		v.AuxInt = int64ToAuxInt(s)
 		v.AddArg2(ptr, mem)
 		return true
 	}
 	// match: (Zero [s] ptr mem)
-	// cond: s%16 == 0 && s > 16*64
-	// result: (LoweredZero ptr (ADDconst [s-16] ptr) mem)
+	// cond: s >= 192
+	// result: (LoweredZeroLoop [s] ptr mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		ptr := v_0
 		mem := v_1
-		if !(s%16 == 0 && s > 16*64) {
+		if !(s >= 192) {
 			break
 		}
-		v.reset(OpARM64LoweredZero)
-		v0 := b.NewValue0(v.Pos, OpARM64ADDconst, ptr.Type)
-		v0.AuxInt = int64ToAuxInt(s - 16)
-		v0.AddArg(ptr)
-		v.AddArg3(ptr, v0, mem)
+		v.reset(OpARM64LoweredZeroLoop)
+		v.AuxInt = int64ToAuxInt(s)
+		v.AddArg2(ptr, mem)
 		return true
 	}
 	return false
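
Editor's illustration (not part of the CL): a minimal, self-contained Go sketch of the instruction sequences the new lowering selects, assuming the 192-byte cutoff and 64-byte loop body from the rules and ssa.go code above. The zeroPlan/emitTail helpers and the "ptr"/"count" register names are invented for exposition; the real backend emits obj.Progs through zero16/zero8 and uses the registers picked by regalloc rather than printing text.

// zeroplan.go prints the approximate code generated for Zero [n] on arm64
// after this change. Assumes n > 16 and n%8 == 0 for simplicity.
package main

import "fmt"

const loopSize = 64 // bytes zeroed per loop iteration (4 x STP)

// emitTail prints straight-line stores for n bytes starting at offset off.
func emitTail(n, off int64) {
	for n >= 16 {
		fmt.Printf("\tSTP (ZR, ZR), %d(ptr)\n", off)
		off += 16
		n -= 16
	}
	if n > 8 {
		fmt.Printf("\tMOVD ZR, %d(ptr)\n", off)
		off += 8
		n -= 8
	}
	if n != 0 {
		// Final 8-byte store may overlap the previous one (off+n-8).
		fmt.Printf("\tMOVD ZR, %d(ptr)\n", off+n-8)
	}
}

// zeroPlan mirrors the LoweredZero / LoweredZeroLoop split.
func zeroPlan(n int64) {
	if n < 192 {
		// LoweredZero: straight-line stores only.
		emitTail(n, 0)
		return
	}
	// LoweredZeroLoop: counted loop of 64-byte iterations, then a tail.
	fmt.Printf("\tMOVD $%d, count\n", n/loopSize)
	fmt.Println("loop:")
	for range 4 {
		fmt.Println("\tSTP.P (ZR, ZR), 16(ptr)") // post-increments ptr
	}
	fmt.Println("\tSUB $1, count")
	fmt.Println("\tCBNZ count, loop")
	emitTail(n%loopSize, 0) // ptr has already advanced past the looped part
}

func main() {
	zeroPlan(200)
}

Running this for a 200-byte Zero, for example, prints the count setup, one four-STP loop body executed three times, and a single trailing MOVD for the remaining 8 bytes.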