Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: use generated loops instead of DUFFZERO on arm64
author: Keith Randall <khr@golang.org>
Thu, 5 Jun 2025 00:14:01 +0000 (17:14 -0700)
committer: Keith Randall <khr@golang.org>
Tue, 12 Aug 2025 16:15:19 +0000 (09:15 -0700)
Change-Id: Ie0c8263f36d1bcfd0edfc4ea6710ae6c113c4d48
Reviewed-on: https://go-review.googlesource.com/c/go/+/678995
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Jorropo <jorropo.pgm@gmail.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
src/cmd/compile/internal/arm64/ssa.go
src/cmd/compile/internal/ssa/_gen/ARM64.rules
src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteARM64.go

index be7887318a44247c26ec7b1a657028b415d16cf7..cd0c2cdfaa741698e49f8f98f7364c459aef69b3 100644 (file)
@@ -1050,33 +1050,118 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p.From.Offset = int64(condCode)
                p.To.Type = obj.TYPE_REG
                p.To.Reg = v.Reg()
-       case ssa.OpARM64DUFFZERO:
-               // runtime.duffzero expects start address in R20
-               p := s.Prog(obj.ADUFFZERO)
-               p.To.Type = obj.TYPE_MEM
-               p.To.Name = obj.NAME_EXTERN
-               p.To.Sym = ir.Syms.Duffzero
-               p.To.Offset = v.AuxInt
        case ssa.OpARM64LoweredZero:
-               // STP.P        (ZR,ZR), 16(R16)
-               // CMP  Rarg1, R16
-               // BLE  -2(PC)
-               // arg1 is the address of the last 16-byte unit to zero
-               p := s.Prog(arm64.ASTP)
-               p.Scond = arm64.C_XPOST
-               p.From.Type = obj.TYPE_REGREG
-               p.From.Reg = arm64.REGZERO
-               p.From.Offset = int64(arm64.REGZERO)
-               p.To.Type = obj.TYPE_MEM
-               p.To.Reg = arm64.REG_R16
-               p.To.Offset = 16
-               p2 := s.Prog(arm64.ACMP)
-               p2.From.Type = obj.TYPE_REG
-               p2.From.Reg = v.Args[1].Reg()
-               p2.Reg = arm64.REG_R16
-               p3 := s.Prog(arm64.ABLE)
-               p3.To.Type = obj.TYPE_BRANCH
-               p3.To.SetTarget(p)
+               ptrReg := v.Args[0].Reg()
+               n := v.AuxInt
+               if n < 16 {
+                       v.Fatalf("Zero too small %d", n)
+               }
+
+               // Generate zeroing instructions.
+               var off int64
+               for n >= 16 {
+                       //  STP     (ZR, ZR), off(ptrReg)
+                       zero16(s, ptrReg, off, false)
+                       off += 16
+                       n -= 16
+               }
+               // Write any fractional portion.
+               // An overlapping 16-byte write can't be used here
+               // because STP's offsets must be a multiple of 8.
+               if n > 8 {
+                       //  MOVD    ZR, off(ptrReg)
+                       zero8(s, ptrReg, off)
+                       off += 8
+                       n -= 8
+               }
+               if n != 0 {
+                       //  MOVD    ZR, off+n-8(ptrReg)
+                       // TODO: for n<=4 we could use a smaller write.
+                       zero8(s, ptrReg, off+n-8)
+               }
+       case ssa.OpARM64LoweredZeroLoop:
+               ptrReg := v.Args[0].Reg()
+               countReg := v.RegTmp()
+               n := v.AuxInt
+               loopSize := int64(64)
+               if n < 3*loopSize {
+                       // - a loop count of 0 won't work.
+                       // - a loop count of 1 is useless.
+                       // - a loop count of 2 is a code size ~tie
+                       //     3 instructions to implement the loop
+                       //     4 instructions in the loop body
+                       //   vs
+                       //     8 instructions in the straightline code
+                       //   Might as well use straightline code.
+                       v.Fatalf("ZeroLoop size too small %d", n)
+               }
+
+               // Put iteration count in a register.
+               //   MOVD    $(n/loopSize), countReg
+               p := s.Prog(arm64.AMOVD)
+               p.From.Type = obj.TYPE_CONST
+               p.From.Offset = n / loopSize
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = countReg
+               cntInit := p
+
+               // Zero loopSize bytes starting at ptrReg.
+               // Increment ptrReg by loopSize as a side effect.
+               for range loopSize / 16 {
+                       //  STP.P   (ZR, ZR), 16(ptrReg)
+                       zero16(s, ptrReg, 0, true)
+                       // TODO: should we use the postincrement form,
+                       // or use a separate += 64 instruction?
+                       // postincrement saves an instruction, but maybe
+                       // it requires more integer units to do the +=16s.
+               }
+               // Decrement loop count.
+               //   SUB     $1, countReg
+               p = s.Prog(arm64.ASUB)
+               p.From.Type = obj.TYPE_CONST
+               p.From.Offset = 1
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = countReg
+               // Jump to loop header if we're not done yet.
+               //   CBNZ    head
+               p = s.Prog(arm64.ACBNZ)
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = countReg
+               p.To.Type = obj.TYPE_BRANCH
+               p.To.SetTarget(cntInit.Link)
+
+               // Multiples of the loop size are now done.
+               n %= loopSize
+
+               // Write any fractional portion.
+               var off int64
+               for n >= 16 {
+                       //  STP     (ZR, ZR), off(ptrReg)
+                       zero16(s, ptrReg, off, false)
+                       off += 16
+                       n -= 16
+               }
+               if n > 8 {
+                       // Note: an overlapping 16-byte write can't be used
+                       // here because STP's offsets must be a multiple of 8.
+                       //  MOVD    ZR, off(ptrReg)
+                       zero8(s, ptrReg, off)
+                       off += 8
+                       n -= 8
+               }
+               if n != 0 {
+                       //  MOVD    ZR, off+n-8(ptrReg)
+                       // TODO: for n<=4 we could use a smaller write.
+                       zero8(s, ptrReg, off+n-8)
+               }
+               // TODO: maybe we should use the count register to instead
+               // hold an end pointer and compare against that?
+               //   ADD $n, ptrReg, endReg
+               // then
+               //   CMP ptrReg, endReg
+               //   BNE loop
+               // There's a past-the-end pointer here, any problem with that?
+
        case ssa.OpARM64DUFFCOPY:
                p := s.Prog(obj.ADUFFCOPY)
                p.To.Type = obj.TYPE_MEM
@@ -1482,3 +1567,35 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
        p.Pos = p.Pos.WithNotStmt()
        return p
 }
+
+// zero16 zeroes 16 bytes at reg+off.
+// If postInc is true, increment reg by 16.
+func zero16(s *ssagen.State, reg int16, off int64, postInc bool) {
+       //   STP     (ZR, ZR), off(reg)
+       p := s.Prog(arm64.ASTP)
+       p.From.Type = obj.TYPE_REGREG
+       p.From.Reg = arm64.REGZERO
+       p.From.Offset = int64(arm64.REGZERO)
+       p.To.Type = obj.TYPE_MEM
+       p.To.Reg = reg
+       p.To.Offset = off
+       if postInc {
+               if off != 0 {
+                       panic("can't postinc with non-zero offset")
+               }
+               //   STP.P  (ZR, ZR), 16(reg)
+               p.Scond = arm64.C_XPOST
+               p.To.Offset = 16
+       }
+}
+
+// zero8 zeroes 8 bytes at reg+off.
+func zero8(s *ssagen.State, reg int16, off int64) {
+       //   MOVD     ZR, off(reg)
+       p := s.Prog(arm64.AMOVD)
+       p.From.Type = obj.TYPE_REG
+       p.From.Reg = arm64.REGZERO
+       p.To.Type = obj.TYPE_MEM
+       p.To.Reg = reg
+       p.To.Offset = off
+}
index 15ba10e216be190d53cb4a6489cba7d0e3ab5c39..197db974b2a501f86eaf068f6c63788278bf0015 100644 (file)
 (Zero [16] ptr mem) =>
        (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)
 
-(Zero [32] ptr mem) =>
-       (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
-               (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
-
-(Zero [48] ptr mem) =>
-       (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
-               (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
-                       (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
-
-(Zero [64] ptr mem) =>
-       (STP [48] ptr (MOVDconst [0]) (MOVDconst [0])
-               (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
-                       (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
-                               (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
-
-// strip off fractional word zeroing
-(Zero [s] ptr mem) && s%16 != 0 && s%16 <= 8 && s > 16 =>
-       (Zero [8]
-               (OffPtr <ptr.Type> ptr [s-8])
-               (Zero [s-s%16] ptr mem))
-(Zero [s] ptr mem) && s%16 != 0 && s%16 > 8 && s > 16 =>
-       (Zero [16]
-               (OffPtr <ptr.Type> ptr [s-16])
-               (Zero [s-s%16] ptr mem))
-
-// medium zeroing uses a duff device
-// 4, 16, and 64 are magic constants, see runtime/mkduff.go
-(Zero [s] ptr mem)
-       && s%16 == 0 && s > 64 && s <= 16*64 =>
-       (DUFFZERO [4 * (64 - s/16)] ptr mem)
-
-// large zeroing uses a loop
-(Zero [s] ptr mem)
-       && s%16 == 0 && s > 16*64 =>
-       (LoweredZero
-               ptr
-               (ADDconst <ptr.Type> [s-16] ptr)
-               mem)
+(Zero [s] ptr mem) && s > 16 && s < 192 => (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s >= 192 => (LoweredZeroLoop [s] ptr mem)
 
 // moves
 (Move [0] _   _   mem) => mem
index 69db139ff027bea984a4f7da63dbd8827c5cda6e..072cc2f4c83f3aa6e50615431f945d830a0893f7 100644 (file)
@@ -536,44 +536,36 @@ func init() {
                {name: "LessThanNoov", argLength: 1, reg: readflags},     // bool, true flags encode signed x<y but without honoring overflow, false otherwise.
                {name: "GreaterEqualNoov", argLength: 1, reg: readflags}, // bool, true flags encode signed x>=y but without honoring overflow, false otherwise.
 
-               // duffzero
+               // medium zeroing
                // arg0 = address of memory to zero
                // arg1 = mem
-               // auxint = offset into duffzero code to start executing
+               // auxint = # of bytes to zero
                // returns mem
-               // R20 changed as side effect
-               // R16 and R17 may be clobbered by linker trampoline.
                {
-                       name:      "DUFFZERO",
+                       name:      "LoweredZero",
                        aux:       "Int64",
                        argLength: 2,
                        reg: regInfo{
-                               inputs:   []regMask{buildReg("R20")},
-                               clobbers: buildReg("R16 R17 R20 R30"),
+                               inputs: []regMask{gp},
                        },
-                       //faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
-                       unsafePoint: true, // FP maintenance around DUFFZERO can be clobbered by interrupts
+                       faultOnNilArg0: true,
                },
 
                // large zeroing
-               // arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
-               // arg1 = address of the last 16-byte unit to zero
-               // arg2 = mem
+               // arg0 = address of memory to zero
+               // arg1 = mem
+               // auxint = # of bytes to zero
                // returns mem
-               //      STP.P   (ZR,ZR), 16(R16)
-               //      CMP     Rarg1, R16
-               //      BLE     -2(PC)
-               // Note: the-end-of-the-memory may be not a valid pointer. it's a problem if it is spilled.
-               // the-end-of-the-memory - 16 is with the area to zero, ok to spill.
                {
-                       name:      "LoweredZero",
-                       argLength: 3,
+                       name:      "LoweredZeroLoop",
+                       aux:       "Int64",
+                       argLength: 2,
                        reg: regInfo{
-                               inputs:   []regMask{buildReg("R16"), gp},
-                               clobbers: buildReg("R16"),
+                               inputs:       []regMask{gp},
+                               clobbersArg0: true,
                        },
-                       clobberFlags:   true,
                        faultOnNilArg0: true,
+                       needIntTemp:    true,
                },
 
                // duffcopy
index d33933e0d8c525fb78f764bcdd6dc3825faf542f..95f8d48a614d5bf74cd6183b0a69853d6f0dbb49 100644 (file)
@@ -1718,8 +1718,8 @@ const (
        OpARM64NotGreaterEqualF
        OpARM64LessThanNoov
        OpARM64GreaterEqualNoov
-       OpARM64DUFFZERO
        OpARM64LoweredZero
+       OpARM64LoweredZeroLoop
        OpARM64DUFFCOPY
        OpARM64LoweredMove
        OpARM64LoweredGetClosurePtr
@@ -23069,28 +23069,27 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:        "DUFFZERO",
-               auxType:     auxInt64,
-               argLen:      2,
-               unsafePoint: true,
+               name:           "LoweredZero",
+               auxType:        auxInt64,
+               argLen:         2,
+               faultOnNilArg0: true,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 524288}, // R20
+                               {0, 335544319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
                        },
-                       clobbers: 269156352, // R16 R17 R20 R30
                },
        },
        {
-               name:           "LoweredZero",
-               argLen:         3,
-               clobberFlags:   true,
+               name:           "LoweredZeroLoop",
+               auxType:        auxInt64,
+               argLen:         2,
+               needIntTemp:    true,
                faultOnNilArg0: true,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 65536},     // R16
-                               {1, 335544319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+                               {0, 335544319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
                        },
-                       clobbers: 65536, // R16
+                       clobbersArg0: true,
                },
        },
        {
index 32f0f554341f4513dd3f0600855f99745d257015..59d6fe64db5001ec9d6413e1d94ef7f3476b7d5d 100644 (file)
@@ -22321,141 +22321,34 @@ func rewriteValueARM64_OpZero(v *Value) bool {
                v.AddArg4(ptr, v0, v0, mem)
                return true
        }
-       // match: (Zero [32] ptr mem)
-       // result: (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
-       for {
-               if auxIntToInt64(v.AuxInt) != 32 {
-                       break
-               }
-               ptr := v_0
-               mem := v_1
-               v.reset(OpARM64STP)
-               v.AuxInt = int32ToAuxInt(16)
-               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
-               v0.AuxInt = int64ToAuxInt(0)
-               v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-               v1.AuxInt = int32ToAuxInt(0)
-               v1.AddArg4(ptr, v0, v0, mem)
-               v.AddArg4(ptr, v0, v0, v1)
-               return true
-       }
-       // match: (Zero [48] ptr mem)
-       // result: (STP [32] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
-       for {
-               if auxIntToInt64(v.AuxInt) != 48 {
-                       break
-               }
-               ptr := v_0
-               mem := v_1
-               v.reset(OpARM64STP)
-               v.AuxInt = int32ToAuxInt(32)
-               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
-               v0.AuxInt = int64ToAuxInt(0)
-               v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-               v1.AuxInt = int32ToAuxInt(16)
-               v2 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-               v2.AuxInt = int32ToAuxInt(0)
-               v2.AddArg4(ptr, v0, v0, mem)
-               v1.AddArg4(ptr, v0, v0, v2)
-               v.AddArg4(ptr, v0, v0, v1)
-               return true
-       }
-       // match: (Zero [64] ptr mem)
-       // result: (STP [48] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [32] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
-       for {
-               if auxIntToInt64(v.AuxInt) != 64 {
-                       break
-               }
-               ptr := v_0
-               mem := v_1
-               v.reset(OpARM64STP)
-               v.AuxInt = int32ToAuxInt(48)
-               v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
-               v0.AuxInt = int64ToAuxInt(0)
-               v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-               v1.AuxInt = int32ToAuxInt(32)
-               v2 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-               v2.AuxInt = int32ToAuxInt(16)
-               v3 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
-               v3.AuxInt = int32ToAuxInt(0)
-               v3.AddArg4(ptr, v0, v0, mem)
-               v2.AddArg4(ptr, v0, v0, v3)
-               v1.AddArg4(ptr, v0, v0, v2)
-               v.AddArg4(ptr, v0, v0, v1)
-               return true
-       }
-       // match: (Zero [s] ptr mem)
-       // cond: s%16 != 0 && s%16 <= 8 && s > 16
-       // result: (Zero [8] (OffPtr <ptr.Type> ptr [s-8]) (Zero [s-s%16] ptr mem))
-       for {
-               s := auxIntToInt64(v.AuxInt)
-               ptr := v_0
-               mem := v_1
-               if !(s%16 != 0 && s%16 <= 8 && s > 16) {
-                       break
-               }
-               v.reset(OpZero)
-               v.AuxInt = int64ToAuxInt(8)
-               v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
-               v0.AuxInt = int64ToAuxInt(s - 8)
-               v0.AddArg(ptr)
-               v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
-               v1.AuxInt = int64ToAuxInt(s - s%16)
-               v1.AddArg2(ptr, mem)
-               v.AddArg2(v0, v1)
-               return true
-       }
-       // match: (Zero [s] ptr mem)
-       // cond: s%16 != 0 && s%16 > 8 && s > 16
-       // result: (Zero [16] (OffPtr <ptr.Type> ptr [s-16]) (Zero [s-s%16] ptr mem))
-       for {
-               s := auxIntToInt64(v.AuxInt)
-               ptr := v_0
-               mem := v_1
-               if !(s%16 != 0 && s%16 > 8 && s > 16) {
-                       break
-               }
-               v.reset(OpZero)
-               v.AuxInt = int64ToAuxInt(16)
-               v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
-               v0.AuxInt = int64ToAuxInt(s - 16)
-               v0.AddArg(ptr)
-               v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
-               v1.AuxInt = int64ToAuxInt(s - s%16)
-               v1.AddArg2(ptr, mem)
-               v.AddArg2(v0, v1)
-               return true
-       }
        // match: (Zero [s] ptr mem)
-       // cond: s%16 == 0 && s > 64 && s <= 16*64
-       // result: (DUFFZERO [4 * (64 - s/16)] ptr mem)
+       // cond: s > 16 && s < 192
+       // result: (LoweredZero [s] ptr mem)
        for {
                s := auxIntToInt64(v.AuxInt)
                ptr := v_0
                mem := v_1
-               if !(s%16 == 0 && s > 64 && s <= 16*64) {
+               if !(s > 16 && s < 192) {
                        break
                }
-               v.reset(OpARM64DUFFZERO)
-               v.AuxInt = int64ToAuxInt(4 * (64 - s/16))
+               v.reset(OpARM64LoweredZero)
+               v.AuxInt = int64ToAuxInt(s)
                v.AddArg2(ptr, mem)
                return true
        }
        // match: (Zero [s] ptr mem)
-       // cond: s%16 == 0 && s > 16*64
-       // result: (LoweredZero ptr (ADDconst <ptr.Type> [s-16] ptr) mem)
+       // cond: s >= 192
+       // result: (LoweredZeroLoop [s] ptr mem)
        for {
                s := auxIntToInt64(v.AuxInt)
                ptr := v_0
                mem := v_1
-               if !(s%16 == 0 && s > 16*64) {
+               if !(s >= 192) {
                        break
                }
-               v.reset(OpARM64LoweredZero)
-               v0 := b.NewValue0(v.Pos, OpARM64ADDconst, ptr.Type)
-               v0.AuxInt = int64ToAuxInt(s - 16)
-               v0.AddArg(ptr)
-               v.AddArg3(ptr, v0, mem)
+               v.reset(OpARM64LoweredZeroLoop)
+               v.AuxInt = int64ToAuxInt(s)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false