Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: Implement LoweredZeroLoop with LSX Instruction on loong64
author: Guoqi Chen <chenguoqi@loongson.cn>
Mon, 17 Nov 2025 03:33:04 +0000 (11:33 +0800)
committer: abner chenc <chenguoqi@loongson.cn>
Thu, 20 Nov 2025 03:38:42 +0000 (19:38 -0800)
goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                   |   old.txt    |               new.txt               |
                   |    sec/op    |   sec/op     vs base                |
ClearFat256           6.406n ± 0%   3.329n ± 1%  -48.03% (p=0.000 n=10)
ClearFat512          12.810n ± 0%   7.607n ± 0%  -40.62% (p=0.000 n=10)
ClearFat1024          25.62n ± 0%   14.01n ± 0%  -45.32% (p=0.000 n=10)
ClearFat1032          26.02n ± 0%   14.28n ± 0%  -45.14% (p=0.000 n=10)
ClearFat1040          26.02n ± 0%   14.41n ± 0%  -44.62% (p=0.000 n=10)
MemclrKnownSize192    4.804n ± 0%   2.827n ± 0%  -41.15% (p=0.000 n=10)
MemclrKnownSize248    6.561n ± 0%   4.371n ± 0%  -33.38% (p=0.000 n=10)
MemclrKnownSize256    6.406n ± 0%   3.335n ± 0%  -47.94% (p=0.000 n=10)
geomean               11.41n        6.453n       -43.45%

goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3C5000 @ 2200.00MHz
                   |   old.txt    |               new.txt               |
                   |    sec/op    |   sec/op     vs base                |
ClearFat256          14.570n ± 0%   7.284n ± 0%  -50.01% (p=0.000 n=10)
ClearFat512           29.13n ± 0%   14.57n ± 0%  -49.98% (p=0.000 n=10)
ClearFat1024          58.26n ± 0%   29.15n ± 0%  -49.97% (p=0.000 n=10)
ClearFat1032          58.73n ± 0%   29.15n ± 0%  -50.36% (p=0.000 n=10)
ClearFat1040          59.18n ± 0%   29.26n ± 0%  -50.56% (p=0.000 n=10)
MemclrKnownSize192   10.930n ± 0%   5.466n ± 0%  -49.99% (p=0.000 n=10)
MemclrKnownSize248   14.110n ± 0%   6.772n ± 0%  -52.01% (p=0.000 n=10)
MemclrKnownSize256   14.570n ± 0%   7.285n ± 0%  -50.00% (p=0.000 n=10)
geomean               25.75n        12.78n       -50.36%

Change-Id: I88d7b6ae2f6fc3f095979f24fb83ff42a9d2d42e
Reviewed-on: https://go-review.googlesource.com/c/go/+/720940
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: Mark Freeman <markfreeman@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
src/cmd/compile/internal/loong64/ssa.go
src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
src/cmd/compile/internal/ssa/opGen.go

index 84bbf9b394d8bc35db151c0caff7b54469acbc92..71953109c464b3e186872f871d124414af53c2e7 100644 (file)
@@ -575,6 +575,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
        case ssa.OpLOONG64LoweredZeroLoop:
                ptrReg := v.Args[0].Reg()
                countReg := v.RegTmp()
+               flagReg := int16(loong64.REGTMP)
                var off int64
                n := v.AuxInt
                loopSize := int64(64)
@@ -587,58 +588,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                        //   vs
                        //     16 instuctions in the straightline code
                        //   Might as well use straightline code.
-                       v.Fatalf("ZeroLoop size tool small %d", n)
+                       v.Fatalf("ZeroLoop size too small %d", n)
                }
 
-               // Put iteration count in a register.
-               //   MOVV     $n/loopSize, countReg
-               p := s.Prog(loong64.AMOVV)
-               p.From.Type = obj.TYPE_CONST
-               p.From.Offset = n / loopSize
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = countReg
-               cntInit := p
+               //    MOVV    $n/loopSize, countReg
+               //    MOVBU   ir.Syms.Loong64HasLSX, flagReg
+               //    BNE     flagReg, lsxInit
+               // genericInit:
+               //    for off = 0; off < loopSize; off += 8 {
+               //            zero8(s, ptrReg, off)
+               //    }
+               //    ADDV    $loopSize, ptrReg
+               //    SUBV    $1, countReg
+               //    BNE     countReg, genericInit
+               //    JMP     tail
+               // lsxInit:
+               //    VXORV   V31, V31, V31    // V31 = 0
+               //    for off = 0; off < loopSize; off += 16 {
+               //            zero16(s, V31, ptrReg, off)
+               //    }
+               //    ADDV    $loopSize, ptrReg
+               //    SUBV    $1, countReg
+               //    BNE     countReg, lsxInit    (branches to the instruction after VXORV)
+               // tail:
+               //    n %= loopSize
+               //    for off = 0; n >= 8; off += 8, n -= 8 {
+               //            zero8(s, ptrReg, off)
+               //    }
+               //
+               //    if n != 0 {
+               //           zero8(s, ptrReg, off+n-8)
+               //    }
 
-               // Zero loopSize bytes starting at ptrReg.
-               for range loopSize / 8 {
-                       // MOVV     ZR, off(ptrReg)
+               p1 := s.Prog(loong64.AMOVV)
+               p1.From.Type = obj.TYPE_CONST
+               p1.From.Offset = n / loopSize
+               p1.To.Type = obj.TYPE_REG
+               p1.To.Reg = countReg
+
+               p2 := s.Prog(loong64.AMOVBU)
+               p2.From.Type = obj.TYPE_MEM
+               p2.From.Name = obj.NAME_EXTERN
+               p2.From.Sym = ir.Syms.Loong64HasLSX
+               p2.To.Type = obj.TYPE_REG
+               p2.To.Reg = flagReg
+
+               p3 := s.Prog(loong64.ABNE)
+               p3.From.Type = obj.TYPE_REG
+               p3.From.Reg = flagReg
+               p3.To.Type = obj.TYPE_BRANCH
+
+               for off = 0; off < loopSize; off += 8 {
                        zero8(s, ptrReg, off)
-                       off += 8
                }
 
-               // Increment ptrReg by loopSize.
-               //   ADDV     $loopSize, ptrReg
-               p = s.Prog(loong64.AADDV)
-               p.From.Type = obj.TYPE_CONST
-               p.From.Offset = loopSize
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = ptrReg
+               p4 := s.Prog(loong64.AADDV)
+               p4.From.Type = obj.TYPE_CONST
+               p4.From.Offset = loopSize
+               p4.To.Type = obj.TYPE_REG
+               p4.To.Reg = ptrReg
 
-               // Decrement loop count.
-               //   SUBV     $1, countReg
-               p = s.Prog(loong64.ASUBV)
-               p.From.Type = obj.TYPE_CONST
-               p.From.Offset = 1
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = countReg
+               p5 := s.Prog(loong64.ASUBV)
+               p5.From.Type = obj.TYPE_CONST
+               p5.From.Offset = 1
+               p5.To.Type = obj.TYPE_REG
+               p5.To.Reg = countReg
 
-               // Jump to loop header if we're not done yet.
-               //   BNE     countReg, loop header
-               p = s.Prog(loong64.ABNE)
-               p.From.Type = obj.TYPE_REG
-               p.From.Reg = countReg
-               p.To.Type = obj.TYPE_BRANCH
-               p.To.SetTarget(cntInit.Link)
+               p6 := s.Prog(loong64.ABNE)
+               p6.From.Type = obj.TYPE_REG
+               p6.From.Reg = countReg
+               p6.To.Type = obj.TYPE_BRANCH
+               p6.To.SetTarget(p3.Link)
+
+               p7 := s.Prog(obj.AJMP)
+               p7.To.Type = obj.TYPE_BRANCH
+
+               p8 := s.Prog(loong64.AVXORV)
+               p8.From.Type = obj.TYPE_REG
+               p8.From.Reg = loong64.REG_V31
+               p8.To.Type = obj.TYPE_REG
+               p8.To.Reg = loong64.REG_V31
+               p3.To.SetTarget(p8)
+
+               for off = 0; off < loopSize; off += 16 {
+                       zero16(s, loong64.REG_V31, ptrReg, off)
+               }
+
+               p9 := s.Prog(loong64.AADDV)
+               p9.From.Type = obj.TYPE_CONST
+               p9.From.Offset = loopSize
+               p9.To.Type = obj.TYPE_REG
+               p9.To.Reg = ptrReg
+
+               p10 := s.Prog(loong64.ASUBV)
+               p10.From.Type = obj.TYPE_CONST
+               p10.From.Offset = 1
+               p10.To.Type = obj.TYPE_REG
+               p10.To.Reg = countReg
+
+               p11 := s.Prog(loong64.ABNE)
+               p11.From.Type = obj.TYPE_REG
+               p11.From.Reg = countReg
+               p11.To.Type = obj.TYPE_BRANCH
+               p11.To.SetTarget(p8.Link)
+
+               p12 := s.Prog(obj.ANOP)
+               p7.To.SetTarget(p12)
 
                // Multiples of the loop size are now done.
                n %= loopSize
-
-               off = 0
                // Write any fractional portion.
-               for n >= 8 {
-                       // MOVV     ZR, off(ptrReg)
+               for off = 0; n >= 8; off += 8 {
+                       // MOVV   ZR, off(ptrReg)
                        zero8(s, ptrReg, off)
-                       off += 8
                        n -= 8
                }
 
@@ -1333,7 +1395,7 @@ func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
 
 // zero8 zeroes 8 bytes at reg+off.
 func zero8(s *ssagen.State, reg int16, off int64) {
-       // MOVV     ZR, off(reg)
+       // MOVV   ZR, off(reg)
        p := s.Prog(loong64.AMOVV)
        p.From.Type = obj.TYPE_REG
        p.From.Reg = loong64.REGZERO
@@ -1341,3 +1403,14 @@ func zero8(s *ssagen.State, reg int16, off int64) {
        p.To.Reg = reg
        p.To.Offset = off
 }
+
+// zero16 zeroes 16 bytes at regBase+off by storing vector register regZero there; regZero must already hold zero.
+func zero16(s *ssagen.State, regZero, regBase int16, off int64) {
+       // VMOVQ   regZero, off(regBase)
+       p := s.Prog(loong64.AVMOVQ)
+       p.From.Type = obj.TYPE_REG
+       p.From.Reg = regZero
+       p.To.Type = obj.TYPE_MEM
+       p.To.Reg = regBase
+       p.To.Offset = off
+}
index 7e8b8bf497b8ffa7d9d7ec745d514e843e41fc30..81d3a3665bdb4fb34acd3b03ea9fc1e125e03b07 100644 (file)
@@ -388,6 +388,7 @@ func init() {
                        argLength: 2,
                        reg: regInfo{
                                inputs:       []regMask{gp},
+                               clobbers:     buildReg("F31"),
                                clobbersArg0: true,
                        },
                        faultOnNilArg0: true,
index 264f4b3bf378f1301e86bf19bd69baf3182f68d4..944e1d78548a9ec2f80a9871c1ed2a258a7398aa 100644 (file)
@@ -26107,6 +26107,7 @@ var opcodeTable = [...]opInfo{
                        inputs: []inputInfo{
                                {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
                        },
+                       clobbers:     2305843009213693952, // F31
                        clobbersArg0: true,
                },
        },