From: Keith Randall
Date: Tue, 3 Jun 2025 23:23:02 +0000 (-0700)
Subject: cmd/compile: use generated loops instead of DUFFZERO on amd64
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=eb7f515c4d920c884ba7c37939f016a153c466e9;p=gostls13.git

cmd/compile: use generated loops instead of DUFFZERO on amd64

goarch: amd64
cpu: 12th Gen Intel(R) Core(TM) i7-12700
                        │    base     │                 exp                 │
                        │   sec/op    │   sec/op     vs base                │
MemclrKnownSize112-20     1.270n ± 14%   1.006n ± 0%  -20.72% (p=0.000 n=10)
MemclrKnownSize128-20     1.266n ±  0%   1.005n ± 0%  -20.58% (p=0.000 n=10)
MemclrKnownSize192-20     1.771n ±  0%   1.579n ± 1%  -10.84% (p=0.000 n=10)
MemclrKnownSize248-20     4.034n ±  0%   3.520n ± 0%  -12.75% (p=0.000 n=10)
MemclrKnownSize256-20     2.269n ±  0%   2.014n ± 0%  -11.26% (p=0.000 n=10)
MemclrKnownSize512-20     4.280n ±  0%   4.030n ± 0%   -5.84% (p=0.000 n=10)
MemclrKnownSize1024-20    8.309n ±  1%   8.057n ± 0%   -3.03% (p=0.000 n=10)

Change-Id: I8f1627e2a1e981ff351dc7178932b32a2627f765
Reviewed-on: https://go-review.googlesource.com/c/go/+/678937
Reviewed-by: Keith Randall
Reviewed-by: Cherry Mui
LUCI-TryBot-Result: Go LUCI
---
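The numbers above come from fixed-size memclr benchmarks. A minimal
standalone reproduction of the kind of clear being measured (the
harness and names here are assumptions for illustration, not part of
this CL) looks like:

	package clear_test

	import "testing"

	var sink [256]byte

	//go:noinline
	func clear256(p *[256]byte) { *p = [256]byte{} } // lowers to a (Zero [256]) op

	func BenchmarkMemclrKnownSize256(b *testing.B) {
		for i := 0; i < b.N; i++ {
			clear256(&sink)
		}
	}

Because the size is a compile-time constant, the compiler picks a
zeroing strategy per size class; this CL replaces the Duff's-device
strategy for the middle sizes with inline stores and a small loop.
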
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index c50f187dc8..625e725fe3 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -1007,26 +1007,103 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		ssagen.AddAux(&p.From, v)
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
-	case ssa.OpAMD64DUFFZERO:
+
+	case ssa.OpAMD64LoweredZero:
 		if s.ABI != obj.ABIInternal {
 			// zero X15 manually
 			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
 		}
-		off := duffStart(v.AuxInt)
-		adj := duffAdj(v.AuxInt)
-		var p *obj.Prog
-		if adj != 0 {
-			p = s.Prog(x86.ALEAQ)
-			p.From.Type = obj.TYPE_MEM
-			p.From.Offset = adj
-			p.From.Reg = x86.REG_DI
-			p.To.Type = obj.TYPE_REG
-			p.To.Reg = x86.REG_DI
+		ptrReg := v.Args[0].Reg()
+		n := v.AuxInt
+		if n < 16 {
+			v.Fatalf("Zero too small %d", n)
 		}
-		p = s.Prog(obj.ADUFFZERO)
-		p.To.Type = obj.TYPE_ADDR
-		p.To.Sym = ir.Syms.Duffzero
-		p.To.Offset = off
+		zero16 := func(off int64) {
+			zero16(s, ptrReg, off)
+		}
+
+		// Generate zeroing instructions.
+		var off int64
+		for n >= 16 {
+			zero16(off)
+			off += 16
+			n -= 16
+		}
+		if n != 0 {
+			// use partially overlapped write.
+			// TODO: n <= 8, use smaller write?
+			zero16(off + n - 16)
+		}
+
+	case ssa.OpAMD64LoweredZeroLoop:
+		if s.ABI != obj.ABIInternal {
+			// zero X15 manually
+			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+		}
+		ptrReg := v.Args[0].Reg()
+		countReg := v.RegTmp()
+		n := v.AuxInt
+		loopSize := int64(64)
+		if n < 3*loopSize {
+			// - a loop count of 0 won't work.
+			// - a loop count of 1 is useless.
+			// - a loop count of 2 is a code size ~tie
+			//     4 instructions to implement the loop
+			//     4 instructions in the loop body
+			//   vs
+			//     8 instructions in the straightline code
+			//   Might as well use straightline code.
+			v.Fatalf("ZeroLoop size too small %d", n)
+		}
+		zero16 := func(off int64) {
+			zero16(s, ptrReg, off)
+		}
+
+		// Put iteration count in a register.
+		//   MOVL $n, countReg
+		p := s.Prog(x86.AMOVL)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = n / loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		cntInit := p
+
+		// Zero loopSize bytes starting at ptrReg.
+		for i := range loopSize / 16 {
+			zero16(i * 16)
+		}
+		//   ADDQ $loopSize, ptrReg
+		p = s.Prog(x86.AADDQ)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = ptrReg
+		//   DECL countReg
+		p = s.Prog(x86.ADECL)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		// Jump to first instruction in loop if we're not done yet.
+		//   JNE head
+		p = s.Prog(x86.AJNE)
+		p.To.Type = obj.TYPE_BRANCH
+		p.To.SetTarget(cntInit.Link)
+
+		// Multiples of the loop size are now done.
+		n %= loopSize
+
+		// Write any fractional portion.
+		var off int64
+		for n >= 16 {
+			zero16(off)
+			off += 16
+			n -= 16
+		}
+		if n != 0 {
+			// Use partially-overlapping write.
+			// TODO: n <= 8, use smaller write?
+			zero16(off + n - 16)
+		}
+
 	case ssa.OpAMD64DUFFCOPY:
 		p := s.Prog(obj.ADUFFCOPY)
 		p.To.Type = obj.TYPE_ADDR
@@ -1621,3 +1698,14 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
 	p.Pos = p.Pos.WithNotStmt()
 	return p
 }
+
+// zero 16 bytes at reg+off.
+func zero16(s *ssagen.State, reg int16, off int64) {
+	// MOVUPS X15, off(ptrReg)
+	p := s.Prog(x86.AMOVUPS)
+	p.From.Type = obj.TYPE_REG
+	p.From.Reg = x86.REG_X15
+	p.To.Type = obj.TYPE_MEM
+	p.To.Reg = reg
+	p.To.Offset = off
+}
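The tail handling in both cases above is the classic overlapping-store
trick: once fewer than 16 bytes remain, the last 16-byte store is
backed up so it ends exactly at the end of the buffer, re-zeroing a
few already-zero bytes instead of branching on the remainder. A rough
Go-level sketch of the same logic (illustrative only; uses the clear
builtin, Go 1.21+, in place of MOVUPS):

	// zeroSmall mirrors the LoweredZero lowering: 16-byte chunks,
	// then one overlapping write for any 1-15 byte tail.
	func zeroSmall(p []byte) {
		n := len(p) // the lowering requires n >= 16
		off := 0
		for n >= 16 {
			clear(p[off : off+16]) // stands in for MOVUPS X15, off(ptr)
			off += 16
			n -= 16
		}
		if n != 0 {
			clear(p[off+n-16 : off+n]) // overlaps the previous chunk
		}
	}
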
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 6013e81115..95e6300126 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -375,34 +375,17 @@
 	(MOVQstoreconst [makeValAndOff(0,int32(s-8))] destptr
 		(MOVQstoreconst [makeValAndOff(0,0)] destptr mem))
 
-// Adjust zeros to be a multiple of 16 bytes.
-(Zero [s] destptr mem) && s%16 != 0 && s > 16 =>
-	(Zero [s-s%16] (OffPtr destptr [s%16])
-		(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
-
-(Zero [16] destptr mem) =>
-	(MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
-(Zero [32] destptr mem) =>
-	(MOVOstoreconst [makeValAndOff(0,16)] destptr
-		(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
-(Zero [48] destptr mem) =>
-	(MOVOstoreconst [makeValAndOff(0,32)] destptr
-		(MOVOstoreconst [makeValAndOff(0,16)] destptr
-			(MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
-(Zero [64] destptr mem) =>
-	(MOVOstoreconst [makeValAndOff(0,48)] destptr
-		(MOVOstoreconst [makeValAndOff(0,32)] destptr
-			(MOVOstoreconst [makeValAndOff(0,16)] destptr
-				(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
-
-// Medium zeroing uses a duff device.
-(Zero [s] destptr mem)
-	&& s > 64 && s <= 1024 && s%16 == 0 =>
-	(DUFFZERO [s] destptr mem)
+// Zeroing up to 192 bytes uses straightline code.
+(Zero [s] destptr mem) && s >= 16 && s < 192 => (LoweredZero [s] destptr mem)
+
+// Zeroing up to ~1KB uses a small loop.
+(Zero [s] destptr mem) && s >= 192 && s <= repZeroThreshold => (LoweredZeroLoop [s] destptr mem)
 
 // Large zeroing uses REP STOSQ.
-(Zero [s] destptr mem)
-	&& s > 1024 && s%8 == 0 =>
+(Zero [s] destptr mem) && s > repZeroThreshold && s%8 != 0 =>
+	(Zero [s-s%8] (OffPtr destptr [s%8])
+		(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [s] destptr mem) && s > repZeroThreshold && s%8 == 0 =>
 	(REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
 
 // Lowering constants
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
index dc29559b04..b6c019f28a 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -889,15 +889,30 @@ func init() {
 		// auxint = # of bytes to zero
 		// returns mem
 		{
-			name:      "DUFFZERO",
+			name:      "LoweredZero",
 			aux:       "Int64",
 			argLength: 2,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("DI")},
-				clobbers: buildReg("DI"),
+				inputs: []regMask{gp},
 			},
-			//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
-			unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+			faultOnNilArg0: true,
+		},
+
+		// arg0 = pointer to start of memory to zero
+		// arg1 = mem
+		// auxint = # of bytes to zero
+		// returns mem
+		{
+			name:      "LoweredZeroLoop",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs:       []regMask{gp},
+				clobbersArg0: true,
+			},
+			clobberFlags:   true,
+			faultOnNilArg0: true,
+			needIntTemp:    true,
 		},
 
 		// arg0 = address of memory to zero
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 36c1815ea2..541237262e 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1051,7 +1051,8 @@ const (
 	OpAMD64MOVLstoreconstidx4
 	OpAMD64MOVQstoreconstidx1
 	OpAMD64MOVQstoreconstidx8
-	OpAMD64DUFFZERO
+	OpAMD64LoweredZero
+	OpAMD64LoweredZeroLoop
 	OpAMD64REPSTOSQ
 	OpAMD64CALLstatic
 	OpAMD64CALLtail
@@ -13873,15 +13874,28 @@ var opcodeTable = [...]opInfo{
 		},
 	},
 	{
-		name:        "DUFFZERO",
-		auxType:     auxInt64,
-		argLen:      2,
-		unsafePoint: true,
+		name:           "LoweredZero",
+		auxType:        auxInt64,
+		argLen:         2,
+		faultOnNilArg0: true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 128}, // DI
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+		},
+	},
+	{
+		name:           "LoweredZeroLoop",
+		auxType:        auxInt64,
+		argLen:         2,
+		clobberFlags:   true,
+		needIntTemp:    true,
+		faultOnNilArg0: true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
 			},
-			clobbers: 128, // DI
+			clobbersArg0: true,
 		},
 	},
 	{
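The register mask 49135 in the generated table decodes per the
compiler's amd64 register numbering (AX = bit 0 through R15 = bit 15):
49135 = 0xBFEF = 0xFFFF with bits 4 (SP) and 14 (R14, the g register
under ABIInternal) cleared. A tiny check, assuming that numbering:

	package main

	import "fmt"

	func main() {
		regs := []string{"AX", "CX", "DX", "BX", "SP", "BP", "SI", "DI",
			"R8", "R9", "R10", "R11", "R12", "R13", "R14", "R15"}
		const mask = 49135 // 0xBFEF
		for i, r := range regs {
			if mask&(1<<i) != 0 {
				fmt.Print(r, " ")
			}
		}
		fmt.Println() // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
	}
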
diff --git a/src/cmd/compile/internal/ssa/regalloc_test.go b/src/cmd/compile/internal/ssa/regalloc_test.go
index 7d804a0d30..e7ed416c50 100644
--- a/src/cmd/compile/internal/ssa/regalloc_test.go
+++ b/src/cmd/compile/internal/ssa/regalloc_test.go
@@ -6,6 +6,7 @@ package ssa
 
 import (
 	"cmd/compile/internal/types"
+	"fmt"
 	"testing"
 )
 
@@ -218,10 +219,37 @@ func TestSpillMove2(t *testing.T) {
 	}
 }
 
+func TestClobbersArg0(t *testing.T) {
+	c := testConfig(t)
+	f := c.Fun("entry",
+		Bloc("entry",
+			Valu("mem", OpInitMem, types.TypeMem, 0, nil),
+			Valu("ptr", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
+			Valu("dst", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
+			Valu("zero", OpAMD64LoweredZeroLoop, types.TypeMem, 256, nil, "ptr", "mem"),
+			Valu("store", OpAMD64MOVQstore, types.TypeMem, 0, nil, "dst", "ptr", "zero"),
+			Exit("store")))
+	flagalloc(f.f)
+	regalloc(f.f)
+	checkFunc(f.f)
+	// LoweredZeroLoop clobbers its argument, so there must be a copy of "ptr" somewhere
+	// so we still have that value available at "store".
+	if n := numCopies(f.blocks["entry"]); n != 1 {
+		fmt.Printf("%s\n", f.f.String())
+		t.Errorf("got %d copies, want 1", n)
+	}
+}
+
 func numSpills(b *Block) int {
+	return numOps(b, OpStoreReg)
+}
+func numCopies(b *Block) int {
+	return numOps(b, OpCopy)
+}
+func numOps(b *Block, op Op) int {
 	n := 0
 	for _, v := range b.Values {
-		if v.Op == OpStoreReg {
+		if v.Op == op {
 			n++
 		}
 	}
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index f6bd4cee57..f9a35deecc 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -29,6 +29,8 @@ type deadValueChoice bool
 const (
 	leaveDeadValues  deadValueChoice = false
 	removeDeadValues                 = true
+
+	repZeroThreshold = 1408 // size beyond which we use REP STOS for zeroing
 )
 
 // deadcode indicates whether rewrite should try to remove any values that become dead.
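Putting the rules and the new constant together, the size classes for
(Zero [s]) on amd64 after this CL are as follows. This is a summary
sketch, not compiler code; the earlier small-size rules and the
s%16/s%8 trimming details are elided:

	// strategy reports which lowering a (Zero [s]) gets on amd64.
	func strategy(s int64) string {
		const repZeroThreshold = 1408 // from rewrite.go above
		switch {
		case s < 16:
			return "individual scalar stores (earlier rules)"
		case s < 192:
			return "LoweredZero: straightline MOVUPS stores"
		case s <= repZeroThreshold:
			return "LoweredZeroLoop: 64-bytes-per-iteration loop"
		default:
			return "REPSTOSQ (after peeling the s%8 leading bytes)"
		}
	}
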
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index d2c136369e..3532d42b0c 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -30025,119 +30025,64 @@ func rewriteValueAMD64_OpZero(v *Value) bool {
 		return true
 	}
 	// match: (Zero [s] destptr mem)
-	// cond: s%16 != 0 && s > 16
-	// result: (Zero [s-s%16] (OffPtr destptr [s%16]) (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
+	// cond: s >= 16 && s < 192
+	// result: (LoweredZero [s] destptr mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		destptr := v_0
 		mem := v_1
-		if !(s%16 != 0 && s > 16) {
+		if !(s >= 16 && s < 192) {
 			break
 		}
-		v.reset(OpZero)
-		v.AuxInt = int64ToAuxInt(s - s%16)
-		v0 := b.NewValue0(v.Pos, OpOffPtr, destptr.Type)
-		v0.AuxInt = int64ToAuxInt(s % 16)
-		v0.AddArg(destptr)
-		v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-		v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
-		v1.AddArg2(destptr, mem)
-		v.AddArg2(v0, v1)
-		return true
-	}
-	// match: (Zero [16] destptr mem)
-	// result: (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
-	for {
-		if auxIntToInt64(v.AuxInt) != 16 {
-			break
-		}
-		destptr := v_0
-		mem := v_1
-		v.reset(OpAMD64MOVOstoreconst)
-		v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
+		v.reset(OpAMD64LoweredZero)
+		v.AuxInt = int64ToAuxInt(s)
 		v.AddArg2(destptr, mem)
 		return true
 	}
-	// match: (Zero [32] destptr mem)
-	// result: (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
-	for {
-		if auxIntToInt64(v.AuxInt) != 32 {
-			break
-		}
-		destptr := v_0
-		mem := v_1
-		v.reset(OpAMD64MOVOstoreconst)
-		v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
-		v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-		v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
-		v0.AddArg2(destptr, mem)
-		v.AddArg2(destptr, v0)
-		return true
-	}
-	// match: (Zero [48] destptr mem)
-	// result: (MOVOstoreconst [makeValAndOff(0,32)] destptr (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
+	// match: (Zero [s] destptr mem)
+	// cond: s >= 192 && s <= repZeroThreshold
+	// result: (LoweredZeroLoop [s] destptr mem)
 	for {
-		if auxIntToInt64(v.AuxInt) != 48 {
-			break
-		}
+		s := auxIntToInt64(v.AuxInt)
 		destptr := v_0
 		mem := v_1
-		v.reset(OpAMD64MOVOstoreconst)
-		v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 32))
-		v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-		v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
-		v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-		v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
-		v1.AddArg2(destptr, mem)
-		v0.AddArg2(destptr, v1)
-		v.AddArg2(destptr, v0)
-		return true
-	}
-	// match: (Zero [64] destptr mem)
-	// result: (MOVOstoreconst [makeValAndOff(0,48)] destptr (MOVOstoreconst [makeValAndOff(0,32)] destptr (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
-	for {
-		if auxIntToInt64(v.AuxInt) != 64 {
+		if !(s >= 192 && s <= repZeroThreshold) {
 			break
 		}
-		destptr := v_0
-		mem := v_1
-		v.reset(OpAMD64MOVOstoreconst)
-		v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 48))
-		v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-		v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 32))
-		v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-		v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
-		v2 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-		v2.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
-		v2.AddArg2(destptr, mem)
-		v1.AddArg2(destptr, v2)
-		v0.AddArg2(destptr, v1)
-		v.AddArg2(destptr, v0)
+		v.reset(OpAMD64LoweredZeroLoop)
+		v.AuxInt = int64ToAuxInt(s)
+		v.AddArg2(destptr, mem)
 		return true
 	}
 	// match: (Zero [s] destptr mem)
-	// cond: s > 64 && s <= 1024 && s%16 == 0
-	// result: (DUFFZERO [s] destptr mem)
+	// cond: s > repZeroThreshold && s%8 != 0
+	// result: (Zero [s-s%8] (OffPtr destptr [s%8]) (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		destptr := v_0
 		mem := v_1
-		if !(s > 64 && s <= 1024 && s%16 == 0) {
+		if !(s > repZeroThreshold && s%8 != 0) {
 			break
 		}
-		v.reset(OpAMD64DUFFZERO)
-		v.AuxInt = int64ToAuxInt(s)
-		v.AddArg2(destptr, mem)
+		v.reset(OpZero)
+		v.AuxInt = int64ToAuxInt(s - s%8)
+		v0 := b.NewValue0(v.Pos, OpOffPtr, destptr.Type)
+		v0.AuxInt = int64ToAuxInt(s % 8)
+		v0.AddArg(destptr)
+		v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
+		v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
+		v1.AddArg2(destptr, mem)
+		v.AddArg2(v0, v1)
 		return true
 	}
 	// match: (Zero [s] destptr mem)
-	// cond: s > 1024 && s%8 == 0
+	// cond: s > repZeroThreshold && s%8 == 0
 	// result: (REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		destptr := v_0
 		mem := v_1
-		if !(s > 1024 && s%8 == 0) {
+		if !(s > repZeroThreshold && s%8 == 0) {
 			break
 		}
 		v.reset(OpAMD64REPSTOSQ)
diff --git a/test/codegen/issue52635.go b/test/codegen/issue52635.go
index 9ee63f0fbe..65f2a021d6 100644
--- a/test/codegen/issue52635.go
+++ b/test/codegen/issue52635.go
@@ -17,31 +17,31 @@ type T struct {
 
 func (t *T) f() {
 	// amd64:-".*runtime.memclrNoHeapPointers"
-	// amd64:"DUFFZERO"
+	// amd64:`MOVUPS\tX15,`
 	for i := range t.a {
 		t.a[i] = 0
 	}
 
 	// amd64:-".*runtime.memclrNoHeapPointers"
-	// amd64:"DUFFZERO"
+	// amd64:`MOVUPS\tX15,`
 	for i := range *t.a {
 		t.a[i] = 0
 	}
 
 	// amd64:-".*runtime.memclrNoHeapPointers"
-	// amd64:"DUFFZERO"
+	// amd64:`MOVUPS\tX15,`
 	for i := range t.a {
 		(*t.a)[i] = 0
 	}
 
 	// amd64:-".*runtime.memclrNoHeapPointers"
-	// amd64:"DUFFZERO"
+	// amd64:`MOVUPS\tX15,`
 	for i := range *t.a {
 		(*t.a)[i] = 0
 	}
 
 	// amd64:-".*runtime.memclrNoHeapPointers"
-	// amd64:"DUFFZERO"
+	// amd64:`MOVUPS\tX15,`
 	for i := range t.b {
 		t.b[i] = 0
 	}
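For reference, the instruction sequence the new LoweredZeroLoop case
emits for a 256-byte zero looks roughly like the listing below. The
register names are illustrative only: the allocator picks the pointer
register and the count temp, and X15 is the ABIInternal zero register
(re-zeroed manually for other ABIs, as the case above shows):

	MOVL	$4, CX         // 256 / loopSize(64) iterations
	loop:
	MOVUPS	X15, 0(AX)     // four 16-byte stores = one 64-byte iteration
	MOVUPS	X15, 16(AX)
	MOVUPS	X15, 32(AX)
	MOVUPS	X15, 48(AX)
	ADDQ	$64, AX
	DECL	CX
	JNE	loop           // no tail stores here: 256 is a multiple of 64

For sizes that are not a multiple of 64, the fractional tail is
written after the loop with more MOVUPS stores, ending with the same
partially-overlapping final store as the straightline LoweredZero case.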