From: limeidan
Date: Thu, 28 Aug 2025 11:22:51 +0000 (+0800)
Subject: cmd/compile: use generated loops instead of DUFFZERO on loong64
X-Git-Tag: go1.26rc1~983
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=7bba745820;p=gostls13.git

cmd/compile: use generated loops instead of DUFFZERO on loong64

Change-Id: Id43ee4353d4bac96627f8b0f54545cdd3d2a1d1b
Reviewed-on: https://go-review.googlesource.com/c/go/+/699695
Reviewed-by: Cherry Mui
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Carlos Amedee
Reviewed-by: abner chenc
---
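
The change picks between two zeroing strategies by size: a Zero of more
than 16 and fewer than 192 bytes is lowered to a straightline run of
8-byte MOVV stores (LoweredZero), while 192 bytes or more becomes a
counted loop over 64-byte chunks (LoweredZeroLoop). Below is a minimal
user-level sketch of the straightline decomposition, mirroring the
zero8 calls emitted in ssa.go (the helper name is invented for
illustration and is not compiler API):

	package main

	import "fmt"

	// straightlineOffsets models the store offsets emitted for a
	// straightline LoweredZero of n bytes (16 < n < 192): 8-byte
	// stores at increasing offsets, with a final store that may
	// overlap the previous one when n is not a multiple of 8.
	func straightlineOffsets(n int64) []int64 {
		var offs []int64
		var off int64
		for n >= 8 {
			offs = append(offs, off)
			off += 8
			n -= 8
		}
		if n != 0 {
			offs = append(offs, off+n-8) // overlapping tail store
		}
		return offs
	}

	func main() {
		fmt.Println(straightlineOffsets(24)) // [0 8 16]
		fmt.Println(straightlineOffsets(29)) // [0 8 16 21]
	}

The overlapping tail store is what lets sizes that are not multiples
of 8 avoid the old rule that peeled off the fractional word with
narrower stores.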
diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go
index 895eadd072..bdc8e37b04 100644
--- a/src/cmd/compile/internal/loong64/ssa.go
+++ b/src/cmd/compile/internal/loong64/ssa.go
@@ -560,28 +560,97 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.To.Sym = ir.Syms.Duffzero
 		p.To.Offset = v.AuxInt
 	case ssa.OpLOONG64LoweredZero:
-		// MOVx	R0, (Rarg0)
-		// ADDV	$sz, Rarg0
-		// BGEU	Rarg1, Rarg0, -2(PC)
-		mov, sz := largestMove(v.AuxInt)
-		p := s.Prog(mov)
-		p.From.Type = obj.TYPE_REG
-		p.From.Reg = loong64.REGZERO
-		p.To.Type = obj.TYPE_MEM
-		p.To.Reg = v.Args[0].Reg()
+		ptrReg := v.Args[0].Reg()
+		n := v.AuxInt
+		if n < 16 {
+			v.Fatalf("Zero too small %d", n)
+		}
 
-		p2 := s.Prog(loong64.AADDVU)
-		p2.From.Type = obj.TYPE_CONST
-		p2.From.Offset = sz
-		p2.To.Type = obj.TYPE_REG
-		p2.To.Reg = v.Args[0].Reg()
+		// Generate zeroing instructions.
+		var off int64
+		for n >= 8 {
+			// MOVV	ZR, off(ptrReg)
+			zero8(s, ptrReg, off)
+			off += 8
+			n -= 8
+		}
+		if n != 0 {
+			// MOVV	ZR, off+n-8(ptrReg)
+			zero8(s, ptrReg, off+n-8)
+		}
+	case ssa.OpLOONG64LoweredZeroLoop:
+		ptrReg := v.Args[0].Reg()
+		countReg := v.RegTmp()
+		var off int64
+		n := v.AuxInt
+		loopSize := int64(64)
+		if n < 3*loopSize {
+			// - a loop count of 0 won't work.
+			// - a loop count of 1 is useless.
+			// - a loop count of 2 is a code size ~tie
+			//     4 instructions to implement the loop
+			//     8 instructions in the loop body
+			//   vs
+			//     16 instructions in the straightline code
+			//   Might as well use straightline code.
+			v.Fatalf("ZeroLoop size too small %d", n)
+		}
 
-		p3 := s.Prog(loong64.ABGEU)
-		p3.From.Type = obj.TYPE_REG
-		p3.From.Reg = v.Args[1].Reg()
-		p3.Reg = v.Args[0].Reg()
-		p3.To.Type = obj.TYPE_BRANCH
-		p3.To.SetTarget(p)
+		// Put iteration count in a register.
+		// MOVV	$n/loopSize, countReg
+		p := s.Prog(loong64.AMOVV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = n / loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		cntInit := p
+
+		// Zero loopSize bytes starting at ptrReg.
+		for range loopSize / 8 {
+			// MOVV	ZR, off(ptrReg)
+			zero8(s, ptrReg, off)
+			off += 8
+		}
+
+		// Increment ptrReg by loopSize.
+		// ADDV	$loopSize, ptrReg
+		p = s.Prog(loong64.AADDV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = ptrReg
+
+		// Decrement loop count.
+		// SUBV	$1, countReg
+		p = s.Prog(loong64.ASUBV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+
+		// Jump to loop header if we're not done yet.
+		// BNE	countReg, loop header
+		p = s.Prog(loong64.ABNE)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = countReg
+		p.To.Type = obj.TYPE_BRANCH
+		p.To.SetTarget(cntInit.Link)
+
+		// Multiples of the loop size are now done.
+		n %= loopSize
+
+		off = 0
+		// Write any fractional portion.
+		for n >= 8 {
+			// MOVV	ZR, off(ptrReg)
+			zero8(s, ptrReg, off)
+			off += 8
+			n -= 8
+		}
+
+		if n != 0 {
+			zero8(s, ptrReg, off+n-8)
+		}
 
 	case ssa.OpLOONG64DUFFCOPY:
 		p := s.Prog(obj.ADUFFCOPY)
@@ -1155,3 +1224,14 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
 	p.Pos = p.Pos.WithNotStmt()
 	return p
 }
+
+// zero8 zeroes 8 bytes at reg+off.
+func zero8(s *ssagen.State, reg int16, off int64) {
+	// MOVV	ZR, off(reg)
+	p := s.Prog(loong64.AMOVV)
+	p.From.Type = obj.TYPE_REG
+	p.From.Reg = loong64.REGZERO
+	p.To.Type = obj.TYPE_MEM
+	p.To.Reg = reg
+	p.To.Offset = off
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
index ca04bdcd42..6dd28c6d45 100644
--- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
@@ -373,24 +373,8 @@
 	(MOVVstore [8] ptr (MOVVconst [0])
 		(MOVVstore ptr (MOVVconst [0]) mem))
 
-// strip off fractional word zeroing
-(Zero [s] ptr mem) && s%8 != 0 && s > 16 =>
-	(Zero [s%8]
-		(OffPtr <ptr.Type> ptr [s-s%8])
-		(Zero [s-s%8] ptr mem))
-
-// medium zeroing uses a duff device
-(Zero [s] ptr mem)
-	&& s%8 == 0 && s > 16 && s <= 8*128 =>
-	(DUFFZERO [8 * (128 - s/8)] ptr mem)
-
-// large zeroing uses a loop
-(Zero [s] ptr mem)
-	&& s%8 == 0 && s > 8*128 =>
-	(LoweredZero
-		ptr
-		(ADDVconst <ptr.Type> ptr [s-8])
-		mem)
+(Zero [s] ptr mem) && s > 16 && s < 192 => (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s >= 192 => (LoweredZeroLoop [s] ptr mem)
 
 // moves
 (Move [0] _ _ mem) => mem
diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
index ccd9721498..ed635bfd97 100644
--- a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
@@ -376,6 +376,21 @@ func init() {
 			faultOnNilArg0: true,
 		},
 
+		// medium zeroing
+		// arg0 = address of memory to zero
+		// arg1 = mem
+		// auxint = number of bytes to zero
+		// returns mem
+		{
+			name:      "LoweredZero",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs: []regMask{gp},
+			},
+			faultOnNilArg0: true,
+		},
+
 		// duffcopy
 		// arg0 = address of dst memory (in R21, changed as side effect)
 		// arg1 = address of src memory (in R20, changed as side effect)
@@ -395,25 +410,21 @@ func init() {
 			faultOnNilArg1: true,
 		},
 
-		// large or unaligned zeroing
-		// arg0 = address of memory to zero (in R20, changed as side effect)
-		// arg1 = address of the last element to zero
-		// arg2 = mem
-		// auxint = alignment
+		// large zeroing
+		// arg0 = address of memory to zero
+		// arg1 = mem
+		// auxint = number of bytes to zero
 		// returns mem
-		// MOVx	R0, (R20)
-		// ADDV	$sz, R20
-		// BGEU	Rarg1, R20, -2(PC)
 		{
-			name:      "LoweredZero",
+			name:      "LoweredZeroLoop",
 			aux:       "Int64",
-			argLength: 3,
+			argLength: 2,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R20"), gp},
-				clobbers: buildReg("R20"),
+				inputs:       []regMask{gp},
+				clobbersArg0: true,
 			},
-			typ:            "Mem",
 			faultOnNilArg0: true,
+			needIntTemp:    true,
 		},
 
 		// large or unaligned move
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 126682b986..7bdb14cec9 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1923,8 +1923,9 @@ const (
 	OpLOONG64CALLclosure
 	OpLOONG64CALLinter
 	OpLOONG64DUFFZERO
-	OpLOONG64DUFFCOPY
 	OpLOONG64LoweredZero
+	OpLOONG64DUFFCOPY
+	OpLOONG64LoweredZeroLoop
 	OpLOONG64LoweredMove
 	OpLOONG64LoweredAtomicLoad8
 	OpLOONG64LoweredAtomicLoad32
@@ -25912,6 +25913,17 @@ var opcodeTable = [...]opInfo{
 			clobbers: 524290, // R1 R20
 		},
 	},
+	{
+		name:           "LoweredZero",
+		auxType:        auxInt64,
+		argLen:         2,
+		faultOnNilArg0: true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+			},
+		},
+	},
 	{
 		name:           "DUFFCOPY",
 		auxType:        auxInt64,
@@ -25927,16 +25939,16 @@ var opcodeTable = [...]opInfo{
 		},
 	},
 	{
-		name:           "LoweredZero",
+		name:           "LoweredZeroLoop",
 		auxType:        auxInt64,
-		argLen:         3,
+		argLen:         2,
+		needIntTemp:    true,
 		faultOnNilArg0: true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 524288},     // R20
-				{1, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+				{0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
 			},
-			clobbers: 524288, // R20
+			clobbersArg0: true,
 		},
 	},
 	{
diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go
index eb134789f7..6407628a8b 100644
--- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go
+++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go
@@ -11497,56 +11497,33 @@ func rewriteValueLOONG64_OpZero(v *Value) bool {
 		return true
 	}
 	// match: (Zero [s] ptr mem)
-	// cond: s%8 != 0 && s > 16
-	// result: (Zero [s%8] (OffPtr <ptr.Type> ptr [s-s%8]) (Zero [s-s%8] ptr mem))
-	for {
-		s := auxIntToInt64(v.AuxInt)
-		ptr := v_0
-		mem := v_1
-		if !(s%8 != 0 && s > 16) {
-			break
-		}
-		v.reset(OpZero)
-		v.AuxInt = int64ToAuxInt(s % 8)
-		v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
-		v0.AuxInt = int64ToAuxInt(s - s%8)
-		v0.AddArg(ptr)
-		v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
-		v1.AuxInt = int64ToAuxInt(s - s%8)
-		v1.AddArg2(ptr, mem)
-		v.AddArg2(v0, v1)
-		return true
-	}
-	// match: (Zero [s] ptr mem)
-	// cond: s%8 == 0 && s > 16 && s <= 8*128
-	// result: (DUFFZERO [8 * (128 - s/8)] ptr mem)
+	// cond: s > 16 && s < 192
+	// result: (LoweredZero [s] ptr mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		ptr := v_0
 		mem := v_1
-		if !(s%8 == 0 && s > 16 && s <= 8*128) {
+		if !(s > 16 && s < 192) {
 			break
 		}
-		v.reset(OpLOONG64DUFFZERO)
-		v.AuxInt = int64ToAuxInt(8 * (128 - s/8))
+		v.reset(OpLOONG64LoweredZero)
+		v.AuxInt = int64ToAuxInt(s)
 		v.AddArg2(ptr, mem)
 		return true
 	}
 	// match: (Zero [s] ptr mem)
-	// cond: s%8 == 0 && s > 8*128
-	// result: (LoweredZero ptr (ADDVconst <ptr.Type> ptr [s-8]) mem)
+	// cond: s >= 192
+	// result: (LoweredZeroLoop [s] ptr mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		ptr := v_0
 		mem := v_1
-		if !(s%8 == 0 && s > 8*128) {
+		if !(s >= 192) {
 			break
 		}
-		v.reset(OpLOONG64LoweredZero)
-		v0 := b.NewValue0(v.Pos, OpLOONG64ADDVconst, ptr.Type)
-		v0.AuxInt = int64ToAuxInt(s - 8)
-		v0.AddArg(ptr)
-		v.AddArg3(ptr, v0, mem)
+		v.reset(OpLOONG64LoweredZeroLoop)
+		v.AuxInt = int64ToAuxInt(s)
+		v.AddArg2(ptr, mem)
 		return true
 	}
 	return false
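
For LoweredZeroLoop the backend initializes a count register to s/64,
emits a loop body of eight 8-byte stores plus a pointer increment, and
zeroes any remainder with straightline stores relative to the advanced
pointer. A rough Go model of that split, assuming the same 64-byte
loop body as the ssa.go change (zeroLoopPlan is an invented name, not
compiler API):

	package main

	import "fmt"

	// zeroLoopPlan reports the loop iteration count and the
	// tail-store offsets (relative to the pointer after the loop)
	// for an n-byte LoweredZeroLoop, n >= 192.
	func zeroLoopPlan(n int64) (iters int64, tail []int64) {
		const loopSize = 64
		iters = n / loopSize // MOVV $n/loopSize, countReg
		n %= loopSize        // the loop handles the multiples
		var off int64
		for n >= 8 {
			tail = append(tail, off)
			off += 8
			n -= 8
		}
		if n != 0 {
			tail = append(tail, off+n-8) // overlapping final store
		}
		return
	}

	func main() {
		fmt.Println(zeroLoopPlan(192)) // 3 []
		fmt.Println(zeroLoopPlan(200)) // 3 [0]
		fmt.Println(zeroLoopPlan(220)) // 3 [0 8 16 20]
	}

A remainder under 8 bytes becomes a single overlapping 8-byte store,
which is safe here because the loop has already zeroed at least 192
bytes before the tail runs.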