Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: use generated loops instead of DUFFZERO on amd64
authorKeith Randall <khr@golang.org>
Tue, 3 Jun 2025 23:23:02 +0000 (16:23 -0700)
committerKeith Randall <khr@golang.org>
Fri, 1 Aug 2025 00:12:39 +0000 (17:12 -0700)
goarch: amd64
cpu: 12th Gen Intel(R) Core(TM) i7-12700
                        │     base      │                 exp                 │
                        │    sec/op     │   sec/op     vs base                │
MemclrKnownSize112-20      1.270n ± 14%   1.006n ± 0%  -20.72% (p=0.000 n=10)
MemclrKnownSize128-20      1.266n ±  0%   1.005n ± 0%  -20.58% (p=0.000 n=10)
MemclrKnownSize192-20      1.771n ±  0%   1.579n ± 1%  -10.84% (p=0.000 n=10)
MemclrKnownSize248-20      4.034n ±  0%   3.520n ± 0%  -12.75% (p=0.000 n=10)
MemclrKnownSize256-20      2.269n ±  0%   2.014n ± 0%  -11.26% (p=0.000 n=10)
MemclrKnownSize512-20      4.280n ±  0%   4.030n ± 0%   -5.84% (p=0.000 n=10)
MemclrKnownSize1024-20     8.309n ±  1%   8.057n ± 0%   -3.03% (p=0.000 n=10)

Change-Id: I8f1627e2a1e981ff351dc7178932b32a2627f765
Reviewed-on: https://go-review.googlesource.com/c/go/+/678937
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/ssa/_gen/AMD64.rules
src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/regalloc_test.go
src/cmd/compile/internal/ssa/rewrite.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
test/codegen/issue52635.go

index c50f187dc8613a886fe7b60d6cac1f38f89cceae..625e725fe34606e1faaec862f8f6bba6ed8cb973 100644 (file)
@@ -1007,26 +1007,103 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                ssagen.AddAux(&p.From, v)
                p.To.Type = obj.TYPE_REG
                p.To.Reg = v.Reg()
-       case ssa.OpAMD64DUFFZERO:
+
+       case ssa.OpAMD64LoweredZero:
                if s.ABI != obj.ABIInternal {
                        // zero X15 manually
                        opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
                }
-               off := duffStart(v.AuxInt)
-               adj := duffAdj(v.AuxInt)
-               var p *obj.Prog
-               if adj != 0 {
-                       p = s.Prog(x86.ALEAQ)
-                       p.From.Type = obj.TYPE_MEM
-                       p.From.Offset = adj
-                       p.From.Reg = x86.REG_DI
-                       p.To.Type = obj.TYPE_REG
-                       p.To.Reg = x86.REG_DI
+               ptrReg := v.Args[0].Reg()
+               n := v.AuxInt
+               if n < 16 {
+                       v.Fatalf("Zero too small %d", n)
                }
-               p = s.Prog(obj.ADUFFZERO)
-               p.To.Type = obj.TYPE_ADDR
-               p.To.Sym = ir.Syms.Duffzero
-               p.To.Offset = off
+               zero16 := func(off int64) {
+                       zero16(s, ptrReg, off)
+               }
+
+               // Generate zeroing instructions.
+               var off int64
+               for n >= 16 {
+                       zero16(off)
+                       off += 16
+                       n -= 16
+               }
+               if n != 0 {
+                       // Use a partially-overlapping write.
+                       // TODO: for n <= 8, use a smaller write?
+                       zero16(off + n - 16)
+               }
+
+       case ssa.OpAMD64LoweredZeroLoop:
+               if s.ABI != obj.ABIInternal {
+                       // zero X15 manually
+                       opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+               }
+               ptrReg := v.Args[0].Reg()
+               countReg := v.RegTmp()
+               n := v.AuxInt
+               loopSize := int64(64)
+               if n < 3*loopSize {
+                       // - a loop count of 0 won't work.
+                       // - a loop count of 1 is useless.
+                       // - a loop count of 2 is a code size ~tie
+                       //     4 instructions to implement the loop
+                       //     4 instructions in the loop body
+                       //   vs
+                       //     8 instructions in the straightline code
+                       //   Might as well use straightline code.
+                       v.Fatalf("ZeroLoop size too small %d", n)
+               }
+               zero16 := func(off int64) {
+                       zero16(s, ptrReg, off)
+               }
+
+               // Put iteration count in a register.
+               //   MOVL    $(n/loopSize), countReg
+               p := s.Prog(x86.AMOVL)
+               p.From.Type = obj.TYPE_CONST
+               p.From.Offset = n / loopSize
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = countReg
+               cntInit := p
+
+               // Zero loopSize bytes starting at ptrReg.
+               for i := range loopSize / 16 {
+                       zero16(i * 16)
+               }
+               //   ADDQ    $loopSize, ptrReg
+               p = s.Prog(x86.AADDQ)
+               p.From.Type = obj.TYPE_CONST
+               p.From.Offset = loopSize
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = ptrReg
+               //   DECL    countReg
+               p = s.Prog(x86.ADECL)
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = countReg
+               // Jump to first instruction in loop if we're not done yet.
+               //   JNE     head
+               p = s.Prog(x86.AJNE)
+               p.To.Type = obj.TYPE_BRANCH
+               p.To.SetTarget(cntInit.Link)
+
+               // Multiples of the loop size are now done.
+               n %= loopSize
+
+               // Write any fractional portion.
+               var off int64
+               for n >= 16 {
+                       zero16(off)
+                       off += 16
+                       n -= 16
+               }
+               if n != 0 {
+                       // Use partially-overlapping write.
+                       // TODO: for n <= 8, use a smaller write?
+                       zero16(off + n - 16)
+               }
+
        case ssa.OpAMD64DUFFCOPY:
                p := s.Prog(obj.ADUFFCOPY)
                p.To.Type = obj.TYPE_ADDR
@@ -1621,3 +1698,14 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
        p.Pos = p.Pos.WithNotStmt()
        return p
 }
+
+// zero16 zeroes 16 bytes at reg+off.
+func zero16(s *ssagen.State, reg int16, off int64) {
+       //   MOVUPS  X15, off(ptrReg)
+       p := s.Prog(x86.AMOVUPS)
+       p.From.Type = obj.TYPE_REG
+       p.From.Reg = x86.REG_X15
+       p.To.Type = obj.TYPE_MEM
+       p.To.Reg = reg
+       p.To.Offset = off
+}
index 6013e8111522c28a5980138e284489ee84fbfed9..95e63001269c2df931aa7e810d116d02f1d80c55 100644 (file)
        (MOVQstoreconst [makeValAndOff(0,int32(s-8))] destptr
                (MOVQstoreconst [makeValAndOff(0,0)] destptr mem))
 
-// Adjust zeros to be a multiple of 16 bytes.
-(Zero [s] destptr mem) && s%16 != 0 && s > 16 =>
-       (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
-               (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
-
-(Zero [16] destptr mem) =>
-       (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
-(Zero [32] destptr mem) =>
-       (MOVOstoreconst [makeValAndOff(0,16)] destptr
-               (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
-(Zero [48] destptr mem) =>
-       (MOVOstoreconst [makeValAndOff(0,32)] destptr
-               (MOVOstoreconst [makeValAndOff(0,16)] destptr
-                       (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
-(Zero [64] destptr mem) =>
-       (MOVOstoreconst [makeValAndOff(0,48)] destptr
-               (MOVOstoreconst [makeValAndOff(0,32)] destptr
-                       (MOVOstoreconst [makeValAndOff(0,16)] destptr
-                               (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
-
-// Medium zeroing uses a duff device.
-(Zero [s] destptr mem)
-       && s > 64 && s <= 1024 && s%16 == 0 =>
-       (DUFFZERO [s] destptr mem)
+// Zeroing fewer than 192 bytes uses straightline code.
+(Zero [s] destptr mem) && s >= 16 && s < 192 => (LoweredZero [s] destptr mem)
+
+// Zeroing up to repZeroThreshold bytes (~1.4KB) uses a small loop.
+(Zero [s] destptr mem) && s >= 192 && s <= repZeroThreshold => (LoweredZeroLoop [s] destptr mem)
 
 // Large zeroing uses REP STOSQ.
-(Zero [s] destptr mem)
-       && s > 1024 && s%8 == 0 =>
+(Zero [s] destptr mem) && s > repZeroThreshold && s%8 != 0 =>
+       (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
+               (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [s] destptr mem) && s > repZeroThreshold && s%8 == 0 =>
        (REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
 
 // Lowering constants
index dc29559b04ce44caf945afa5f15f7fdf5212430c..b6c019f28aaf901e644514fb7c216ec7d3dde35e 100644 (file)
@@ -889,15 +889,30 @@ func init() {
                // auxint = # of bytes to zero
                // returns mem
                {
-                       name:      "DUFFZERO",
+                       name:      "LoweredZero",
                        aux:       "Int64",
                        argLength: 2,
                        reg: regInfo{
-                               inputs:   []regMask{buildReg("DI")},
-                               clobbers: buildReg("DI"),
+                               inputs: []regMask{gp},
                        },
-                       //faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
-                       unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+                       faultOnNilArg0: true,
+               },
+
+               // arg0 = pointer to start of memory to zero
+               // arg1 = mem
+               // auxint = # of bytes to zero
+               // returns mem
+               {
+                       name:      "LoweredZeroLoop",
+                       aux:       "Int64",
+                       argLength: 2,
+                       reg: regInfo{
+                               inputs:       []regMask{gp},
+                               clobbersArg0: true,
+                       },
+                       clobberFlags:   true,
+                       faultOnNilArg0: true,
+                       needIntTemp:    true,
                },
 
                // arg0 = address of memory to zero
index 36c1815ea2ea7dcf100472b320020044eef96873..541237262ebe19a7e36137cfb22f6e8e27ee95a2 100644 (file)
@@ -1051,7 +1051,8 @@ const (
        OpAMD64MOVLstoreconstidx4
        OpAMD64MOVQstoreconstidx1
        OpAMD64MOVQstoreconstidx8
-       OpAMD64DUFFZERO
+       OpAMD64LoweredZero
+       OpAMD64LoweredZeroLoop
        OpAMD64REPSTOSQ
        OpAMD64CALLstatic
        OpAMD64CALLtail
@@ -13873,15 +13874,28 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:        "DUFFZERO",
-               auxType:     auxInt64,
-               argLen:      2,
-               unsafePoint: true,
+               name:           "LoweredZero",
+               auxType:        auxInt64,
+               argLen:         2,
+               faultOnNilArg0: true,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 128}, // DI
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                       },
+               },
+       },
+       {
+               name:           "LoweredZeroLoop",
+               auxType:        auxInt64,
+               argLen:         2,
+               clobberFlags:   true,
+               needIntTemp:    true,
+               faultOnNilArg0: true,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
                        },
-                       clobbers: 128, // DI
+                       clobbersArg0: true,
                },
        },
        {
index 7d804a0d30be33674b7eae36ca238586451e994c..e7ed416c507d511431f704810decad4d955dba1f 100644 (file)
@@ -6,6 +6,7 @@ package ssa
 
 import (
        "cmd/compile/internal/types"
+       "fmt"
        "testing"
 )
 
@@ -218,10 +219,37 @@ func TestSpillMove2(t *testing.T) {
 
 }
 
+func TestClobbersArg0(t *testing.T) {
+       c := testConfig(t)
+       f := c.Fun("entry",
+               Bloc("entry",
+                       Valu("mem", OpInitMem, types.TypeMem, 0, nil),
+                       Valu("ptr", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
+                       Valu("dst", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
+                       Valu("zero", OpAMD64LoweredZeroLoop, types.TypeMem, 256, nil, "ptr", "mem"),
+                       Valu("store", OpAMD64MOVQstore, types.TypeMem, 0, nil, "dst", "ptr", "zero"),
+                       Exit("store")))
+       flagalloc(f.f)
+       regalloc(f.f)
+       checkFunc(f.f)
+       // LoweredZeroLoop clobbers its argument, so there must be a copy of "ptr" somewhere
+       // so we still have that value available at "store".
+       if n := numCopies(f.blocks["entry"]); n != 1 {
+               fmt.Printf("%s\n", f.f.String())
+               t.Errorf("got %d copies, want 1", n)
+       }
+}
+
 func numSpills(b *Block) int {
+       return numOps(b, OpStoreReg)
+}
+func numCopies(b *Block) int {
+       return numOps(b, OpCopy)
+}
+func numOps(b *Block, op Op) int {
        n := 0
        for _, v := range b.Values {
-               if v.Op == OpStoreReg {
+               if v.Op == op {
                        n++
                }
        }
index f6bd4cee5754075665b10f763db3fde6b2438879..f9a35deecc3d0bc4d9709f6aeb2cb1164c7bedb5 100644 (file)
@@ -29,6 +29,8 @@ type deadValueChoice bool
 const (
        leaveDeadValues  deadValueChoice = false
        removeDeadValues                 = true
+
+       repZeroThreshold = 1408 // size beyond which we use REP STOS for zeroing
 )
 
 // deadcode indicates whether rewrite should try to remove any values that become dead.
index d2c136369e1b35ce2246823a1f00d47a7e10a92c..3532d42b0cda6eb0cbd11635cac028fe3f93ccbf 100644 (file)
@@ -30025,119 +30025,64 @@ func rewriteValueAMD64_OpZero(v *Value) bool {
                return true
        }
        // match: (Zero [s] destptr mem)
-       // cond: s%16 != 0 && s > 16
-       // result: (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16]) (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
+       // cond: s >= 16 && s < 192
+       // result: (LoweredZero [s] destptr mem)
        for {
                s := auxIntToInt64(v.AuxInt)
                destptr := v_0
                mem := v_1
-               if !(s%16 != 0 && s > 16) {
+               if !(s >= 16 && s < 192) {
                        break
                }
-               v.reset(OpZero)
-               v.AuxInt = int64ToAuxInt(s - s%16)
-               v0 := b.NewValue0(v.Pos, OpOffPtr, destptr.Type)
-               v0.AuxInt = int64ToAuxInt(s % 16)
-               v0.AddArg(destptr)
-               v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-               v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
-               v1.AddArg2(destptr, mem)
-               v.AddArg2(v0, v1)
-               return true
-       }
-       // match: (Zero [16] destptr mem)
-       // result: (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
-       for {
-               if auxIntToInt64(v.AuxInt) != 16 {
-                       break
-               }
-               destptr := v_0
-               mem := v_1
-               v.reset(OpAMD64MOVOstoreconst)
-               v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
+               v.reset(OpAMD64LoweredZero)
+               v.AuxInt = int64ToAuxInt(s)
                v.AddArg2(destptr, mem)
                return true
        }
-       // match: (Zero [32] destptr mem)
-       // result: (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
-       for {
-               if auxIntToInt64(v.AuxInt) != 32 {
-                       break
-               }
-               destptr := v_0
-               mem := v_1
-               v.reset(OpAMD64MOVOstoreconst)
-               v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
-               v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-               v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
-               v0.AddArg2(destptr, mem)
-               v.AddArg2(destptr, v0)
-               return true
-       }
-       // match: (Zero [48] destptr mem)
-       // result: (MOVOstoreconst [makeValAndOff(0,32)] destptr (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
+       // match: (Zero [s] destptr mem)
+       // cond: s >= 192 && s <= repZeroThreshold
+       // result: (LoweredZeroLoop [s] destptr mem)
        for {
-               if auxIntToInt64(v.AuxInt) != 48 {
-                       break
-               }
+               s := auxIntToInt64(v.AuxInt)
                destptr := v_0
                mem := v_1
-               v.reset(OpAMD64MOVOstoreconst)
-               v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 32))
-               v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-               v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
-               v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-               v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
-               v1.AddArg2(destptr, mem)
-               v0.AddArg2(destptr, v1)
-               v.AddArg2(destptr, v0)
-               return true
-       }
-       // match: (Zero [64] destptr mem)
-       // result: (MOVOstoreconst [makeValAndOff(0,48)] destptr (MOVOstoreconst [makeValAndOff(0,32)] destptr (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
-       for {
-               if auxIntToInt64(v.AuxInt) != 64 {
+               if !(s >= 192 && s <= repZeroThreshold) {
                        break
                }
-               destptr := v_0
-               mem := v_1
-               v.reset(OpAMD64MOVOstoreconst)
-               v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 48))
-               v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-               v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 32))
-               v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-               v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
-               v2 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
-               v2.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
-               v2.AddArg2(destptr, mem)
-               v1.AddArg2(destptr, v2)
-               v0.AddArg2(destptr, v1)
-               v.AddArg2(destptr, v0)
+               v.reset(OpAMD64LoweredZeroLoop)
+               v.AuxInt = int64ToAuxInt(s)
+               v.AddArg2(destptr, mem)
                return true
        }
        // match: (Zero [s] destptr mem)
-       // cond: s > 64 && s <= 1024 && s%16 == 0
-       // result: (DUFFZERO [s] destptr mem)
+       // cond: s > repZeroThreshold && s%8 != 0
+       // result: (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8]) (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
        for {
                s := auxIntToInt64(v.AuxInt)
                destptr := v_0
                mem := v_1
-               if !(s > 64 && s <= 1024 && s%16 == 0) {
+               if !(s > repZeroThreshold && s%8 != 0) {
                        break
                }
-               v.reset(OpAMD64DUFFZERO)
-               v.AuxInt = int64ToAuxInt(s)
-               v.AddArg2(destptr, mem)
+               v.reset(OpZero)
+               v.AuxInt = int64ToAuxInt(s - s%8)
+               v0 := b.NewValue0(v.Pos, OpOffPtr, destptr.Type)
+               v0.AuxInt = int64ToAuxInt(s % 8)
+               v0.AddArg(destptr)
+               v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
+               v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
+               v1.AddArg2(destptr, mem)
+               v.AddArg2(v0, v1)
                return true
        }
        // match: (Zero [s] destptr mem)
-       // cond: s > 1024 && s%8 == 0
+       // cond: s > repZeroThreshold && s%8 == 0
        // result: (REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
        for {
                s := auxIntToInt64(v.AuxInt)
                destptr := v_0
                mem := v_1
-               if !(s > 1024 && s%8 == 0) {
+               if !(s > repZeroThreshold && s%8 == 0) {
                        break
                }
                v.reset(OpAMD64REPSTOSQ)
index 9ee63f0fbeccdd6f070dd5d1c822cb8342ac52d7..65f2a021d63c01e0db6fa7e2242fcaec79ed06c0 100644 (file)
@@ -17,31 +17,31 @@ type T struct {
 
 func (t *T) f() {
        // amd64:-".*runtime.memclrNoHeapPointers"
-       // amd64:"DUFFZERO"
+       // amd64:`MOVUPS\tX15,`
        for i := range t.a {
                t.a[i] = 0
        }
 
        // amd64:-".*runtime.memclrNoHeapPointers"
-       // amd64:"DUFFZERO"
+       // amd64:`MOVUPS\tX15,`
        for i := range *t.a {
                t.a[i] = 0
        }
 
        // amd64:-".*runtime.memclrNoHeapPointers"
-       // amd64:"DUFFZERO"
+       // amd64:`MOVUPS\tX15,`
        for i := range t.a {
                (*t.a)[i] = 0
        }
 
        // amd64:-".*runtime.memclrNoHeapPointers"
-       // amd64:"DUFFZERO"
+       // amd64:`MOVUPS\tX15,`
        for i := range *t.a {
                (*t.a)[i] = 0
        }
 
        // amd64:-".*runtime.memclrNoHeapPointers"
-       // amd64:"DUFFZERO"
+       // amd64:`MOVUPS\tX15,`
        for i := range t.b {
                t.b[i] = 0
        }