From: Guoqi Chen
Date: Mon, 17 Nov 2025 03:33:04 +0000 (+0800)
Subject: cmd/compile: Implement LoweredZeroLoop with LSX Instruction on loong64
X-Git-Tag: go1.26rc1~234
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=c4bb9653ba28cba4bcd3a3cbb64285c495a03ba2;p=gostls13.git

cmd/compile: Implement LoweredZeroLoop with LSX Instruction on loong64

goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                   |   old.txt    |               new.txt               |
                   |    sec/op    |    sec/op     vs base               |
ClearFat256           6.406n ± 0%    3.329n ± 1%  -48.03% (p=0.000 n=10)
ClearFat512          12.810n ± 0%    7.607n ± 0%  -40.62% (p=0.000 n=10)
ClearFat1024          25.62n ± 0%    14.01n ± 0%  -45.32% (p=0.000 n=10)
ClearFat1032          26.02n ± 0%    14.28n ± 0%  -45.14% (p=0.000 n=10)
ClearFat1040          26.02n ± 0%    14.41n ± 0%  -44.62% (p=0.000 n=10)
MemclrKnownSize192    4.804n ± 0%    2.827n ± 0%  -41.15% (p=0.000 n=10)
MemclrKnownSize248    6.561n ± 0%    4.371n ± 0%  -33.38% (p=0.000 n=10)
MemclrKnownSize256    6.406n ± 0%    3.335n ± 0%  -47.94% (p=0.000 n=10)
geomean               11.41n         6.453n       -43.45%

goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3C5000 @ 2200.00MHz
                   |   old.txt    |               new.txt               |
                   |    sec/op    |    sec/op     vs base               |
ClearFat256          14.570n ± 0%    7.284n ± 0%  -50.01% (p=0.000 n=10)
ClearFat512           29.13n ± 0%    14.57n ± 0%  -49.98% (p=0.000 n=10)
ClearFat1024          58.26n ± 0%    29.15n ± 0%  -49.97% (p=0.000 n=10)
ClearFat1032          58.73n ± 0%    29.15n ± 0%  -50.36% (p=0.000 n=10)
ClearFat1040          59.18n ± 0%    29.26n ± 0%  -50.56% (p=0.000 n=10)
MemclrKnownSize192   10.930n ± 0%    5.466n ± 0%  -49.99% (p=0.000 n=10)
MemclrKnownSize248   14.110n ± 0%    6.772n ± 0%  -52.01% (p=0.000 n=10)
MemclrKnownSize256   14.570n ± 0%    7.285n ± 0%  -50.00% (p=0.000 n=10)
geomean               25.75n         12.78n       -50.36%

Change-Id: I88d7b6ae2f6fc3f095979f24fb83ff42a9d2d42e
Reviewed-on: https://go-review.googlesource.com/c/go/+/720940
Reviewed-by: Meidan Li
Reviewed-by: Mark Freeman
LUCI-TryBot-Result: Go LUCI
Reviewed-by: sophie zhao
Reviewed-by: Keith Randall
Reviewed-by: Keith Randall
---

diff --git a/src/cmd/compile/internal/loong64/ssa.go
b/src/cmd/compile/internal/loong64/ssa.go index 84bbf9b394..71953109c4 100644 --- a/src/cmd/compile/internal/loong64/ssa.go +++ b/src/cmd/compile/internal/loong64/ssa.go @@ -575,6 +575,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { case ssa.OpLOONG64LoweredZeroLoop: ptrReg := v.Args[0].Reg() countReg := v.RegTmp() + flagReg := int16(loong64.REGTMP) var off int64 n := v.AuxInt loopSize := int64(64) @@ -587,58 +588,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { // vs // 16 instuctions in the straightline code // Might as well use straightline code. - v.Fatalf("ZeroLoop size tool small %d", n) + v.Fatalf("ZeroLoop size too small %d", n) } - // Put iteration count in a register. - // MOVV $n/loopSize, countReg - p := s.Prog(loong64.AMOVV) - p.From.Type = obj.TYPE_CONST - p.From.Offset = n / loopSize - p.To.Type = obj.TYPE_REG - p.To.Reg = countReg - cntInit := p + // MOVV $n/loopSize, countReg + // MOVBU ir.Syms.Loong64HasLSX, flagReg + // BNE flagReg, lsxInit + // genericInit: + // for off = 0; off < loopSize; off += 8 { + // zero8(s, ptrReg, off) + // } + // ADDV $loopSize, ptrReg + // SUBV $1, countReg + // BNE countReg, genericInit + // JMP tail + // lsxInit: + // VXORV V31, V31, V31, v31 = 0 + // for off = 0; off < loopSize; off += 16 { + // zero16(s, V31, ptrReg, off) + // } + // ADDV $loopSize, ptrReg + // SUBV $1, countReg + // BNE countReg, lsxInit + // tail: + // n %= loopSize + // for off = 0; n >= 8; off += 8, n -= 8 { + // zero8(s, ptrReg, off) + // } + // + // if n != 0 { + // zero8(s, ptrReg, off+n-8) + // } - // Zero loopSize bytes starting at ptrReg. 
- for range loopSize / 8 { - // MOVV ZR, off(ptrReg) + p1 := s.Prog(loong64.AMOVV) + p1.From.Type = obj.TYPE_CONST + p1.From.Offset = n / loopSize + p1.To.Type = obj.TYPE_REG + p1.To.Reg = countReg + + p2 := s.Prog(loong64.AMOVBU) + p2.From.Type = obj.TYPE_MEM + p2.From.Name = obj.NAME_EXTERN + p2.From.Sym = ir.Syms.Loong64HasLSX + p2.To.Type = obj.TYPE_REG + p2.To.Reg = flagReg + + p3 := s.Prog(loong64.ABNE) + p3.From.Type = obj.TYPE_REG + p3.From.Reg = flagReg + p3.To.Type = obj.TYPE_BRANCH + + for off = 0; off < loopSize; off += 8 { zero8(s, ptrReg, off) - off += 8 } - // Increment ptrReg by loopSize. - // ADDV $loopSize, ptrReg - p = s.Prog(loong64.AADDV) - p.From.Type = obj.TYPE_CONST - p.From.Offset = loopSize - p.To.Type = obj.TYPE_REG - p.To.Reg = ptrReg + p4 := s.Prog(loong64.AADDV) + p4.From.Type = obj.TYPE_CONST + p4.From.Offset = loopSize + p4.To.Type = obj.TYPE_REG + p4.To.Reg = ptrReg - // Decrement loop count. - // SUBV $1, countReg - p = s.Prog(loong64.ASUBV) - p.From.Type = obj.TYPE_CONST - p.From.Offset = 1 - p.To.Type = obj.TYPE_REG - p.To.Reg = countReg + p5 := s.Prog(loong64.ASUBV) + p5.From.Type = obj.TYPE_CONST + p5.From.Offset = 1 + p5.To.Type = obj.TYPE_REG + p5.To.Reg = countReg - // Jump to loop header if we're not done yet. 
- // BNE countReg, loop header - p = s.Prog(loong64.ABNE) - p.From.Type = obj.TYPE_REG - p.From.Reg = countReg - p.To.Type = obj.TYPE_BRANCH - p.To.SetTarget(cntInit.Link) + p6 := s.Prog(loong64.ABNE) + p6.From.Type = obj.TYPE_REG + p6.From.Reg = countReg + p6.To.Type = obj.TYPE_BRANCH + p6.To.SetTarget(p3.Link) + + p7 := s.Prog(obj.AJMP) + p7.To.Type = obj.TYPE_BRANCH + + p8 := s.Prog(loong64.AVXORV) + p8.From.Type = obj.TYPE_REG + p8.From.Reg = loong64.REG_V31 + p8.To.Type = obj.TYPE_REG + p8.To.Reg = loong64.REG_V31 + p3.To.SetTarget(p8) + + for off = 0; off < loopSize; off += 16 { + zero16(s, loong64.REG_V31, ptrReg, off) + } + + p9 := s.Prog(loong64.AADDV) + p9.From.Type = obj.TYPE_CONST + p9.From.Offset = loopSize + p9.To.Type = obj.TYPE_REG + p9.To.Reg = ptrReg + + p10 := s.Prog(loong64.ASUBV) + p10.From.Type = obj.TYPE_CONST + p10.From.Offset = 1 + p10.To.Type = obj.TYPE_REG + p10.To.Reg = countReg + + p11 := s.Prog(loong64.ABNE) + p11.From.Type = obj.TYPE_REG + p11.From.Reg = countReg + p11.To.Type = obj.TYPE_BRANCH + p11.To.SetTarget(p8.Link) + + p12 := s.Prog(obj.ANOP) + p7.To.SetTarget(p12) // Multiples of the loop size are now done. n %= loopSize - - off = 0 // Write any fractional portion. - for n >= 8 { - // MOVV ZR, off(ptrReg) + for off = 0; n >= 8; off += 8 { + // MOVV ZR, off(ptrReg) zero8(s, ptrReg, off) - off += 8 n -= 8 } @@ -1333,7 +1395,7 @@ func move8(s *ssagen.State, src, dst, tmp int16, off int64) { // zero8 zeroes 8 bytes at reg+off. func zero8(s *ssagen.State, reg int16, off int64) { - // MOVV ZR, off(reg) + // MOVV ZR, off(reg) p := s.Prog(loong64.AMOVV) p.From.Type = obj.TYPE_REG p.From.Reg = loong64.REGZERO @@ -1341,3 +1403,14 @@ func zero8(s *ssagen.State, reg int16, off int64) { p.To.Reg = reg p.To.Offset = off } + +// zero16 zeroes 16 bytes at reg+off. 
+func zero16(s *ssagen.State, regZero, regBase int16, off int64) { + // VMOVQ regZero, off(regBase) + p := s.Prog(loong64.AVMOVQ) + p.From.Type = obj.TYPE_REG + p.From.Reg = regZero + p.To.Type = obj.TYPE_MEM + p.To.Reg = regBase + p.To.Offset = off +} diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go index 7e8b8bf497..81d3a3665b 100644 --- a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go @@ -388,6 +388,7 @@ func init() { argLength: 2, reg: regInfo{ inputs: []regMask{gp}, + clobbers: buildReg("F31"), clobbersArg0: true, }, faultOnNilArg0: true, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 264f4b3bf3..944e1d7854 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -26107,6 +26107,7 @@ var opcodeTable = [...]opInfo{ inputs: []inputInfo{ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 }, + clobbers: 2305843009213693952, // F31 clobbersArg0: true, }, },