Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: use arm64 neon in LoweredMemmove/LoweredMemmoveLoop
author Vasily Leonenko <vasiliy.leonenko@gmail.com>
Fri, 25 Jul 2025 20:06:33 +0000 (23:06 +0300)
committer Keith Randall <khr@golang.org>
Wed, 15 Oct 2025 16:38:03 +0000 (09:38 -0700)
Raspberry Pi 5 (Cortex-A76)

                     │   base.log   │               opt.log               │
                     │    sec/op    │   sec/op     vs base                │
MemmoveKnownSize112     3.549n ± 0%   3.652n ± 0%   +2.92% (p=0.000 n=10)
MemmoveKnownSize128     3.979n ± 0%   3.617n ± 0%   -9.09% (p=0.000 n=10)
MemmoveKnownSize192     7.566n ± 0%   5.074n ± 0%  -32.94% (p=0.000 n=10)
MemmoveKnownSize248     8.549n ± 0%   7.184n ± 1%  -15.97% (p=0.000 n=10)
MemmoveKnownSize256    10.010n ± 0%   6.827n ± 0%  -31.80% (p=0.000 n=10)
MemmoveKnownSize512     19.81n ± 0%   13.59n ± 0%  -31.40% (p=0.000 n=10)
MemmoveKnownSize1024    39.66n ± 0%   27.00n ± 0%  -31.93% (p=0.000 n=10)
geomean                 9.538n        7.392n       -22.50%

Change-Id: I7b17408cd0a500ceaa80bc93ffe2f19ddeea9c0d
Reviewed-on: https://go-review.googlesource.com/c/go/+/692315
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/cmd/compile/internal/arm64/ssa.go
src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
src/cmd/compile/internal/ssa/opGen.go

index 7bc0e536e941e645c44b32223f76e03248893c54..43ecb6b4b715b42dba957eab28d3d3d6f25a19ae 100644 (file)
@@ -1189,8 +1189,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                if dstReg == srcReg {
                        break
                }
-               tmpReg1 := int16(arm64.REG_R24)
-               tmpReg2 := int16(arm64.REG_R25)
+               tmpReg1 := int16(arm64.REG_R25)
+               tmpFReg1 := int16(arm64.REG_F16)
+               tmpFReg2 := int16(arm64.REG_F17)
                n := v.AuxInt
                if n < 16 {
                        v.Fatalf("Move too small %d", n)
@@ -1198,10 +1199,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 
                // Generate copying instructions.
                var off int64
+               for n >= 32 {
+                       //  FLDPQ   off(srcReg), (tmpFReg1, tmpFReg2)
+                       //  FSTPQ   (tmpFReg1, tmpFReg2), off(dstReg)
+                       move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
+                       off += 32
+                       n -= 32
+               }
                for n >= 16 {
-                       // LDP     off(srcReg), (tmpReg1, tmpReg2)
-                       // STP     (tmpReg1, tmpReg2), off(dstReg)
-                       move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+                       //  FMOVQ   off(srcReg), tmpFReg1
+                       //  FMOVQ   tmpFReg1, off(dstReg)
+                       move16(s, srcReg, dstReg, tmpFReg1, off, false)
                        off += 16
                        n -= 16
                }
@@ -1223,9 +1231,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                if dstReg == srcReg {
                        break
                }
-               countReg := int16(arm64.REG_R23)
-               tmpReg1 := int16(arm64.REG_R24)
-               tmpReg2 := int16(arm64.REG_R25)
+               countReg := int16(arm64.REG_R24)
+               tmpReg1 := int16(arm64.REG_R25)
+               tmpFReg1 := int16(arm64.REG_F16)
+               tmpFReg2 := int16(arm64.REG_F17)
                n := v.AuxInt
                loopSize := int64(64)
                if n < 3*loopSize {
@@ -1251,10 +1260,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 
                // Move loopSize bytes starting at srcReg to dstReg.
                // Increment srcReg and destReg by loopSize as a side effect.
-               for range loopSize / 16 {
-                       // LDP.P  16(srcReg), (tmpReg1, tmpReg2)
-                       // STP.P  (tmpReg1, tmpReg2), 16(dstReg)
-                       move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
+               for range loopSize / 32 {
+                       // FLDPQ.P 32(srcReg), (tmpFReg1, tmpFReg2)
+                       // FSTPQ.P (tmpFReg1, tmpFReg2), 32(dstReg)
+                       move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, 0, true)
                }
                // Decrement loop count.
                //   SUB     $1, countReg
@@ -1276,10 +1285,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 
                // Copy any fractional portion.
                var off int64
+               for n >= 32 {
+                       //  FLDPQ   off(srcReg), (tmpFReg1, tmpFReg2)
+                       //  FSTPQ   (tmpFReg1, tmpFReg2), off(dstReg)
+                       move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
+                       off += 32
+                       n -= 32
+               }
                for n >= 16 {
-                       //  LDP     off(srcReg), (tmpReg1, tmpReg2)
-                       //  STP     (tmpReg1, tmpReg2), off(dstReg)
-                       move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+                       //  FMOVQ   off(srcReg), tmpFReg1
+                       //  FMOVQ   tmpFReg1, off(dstReg)
+                       move16(s, srcReg, dstReg, tmpFReg1, off, false)
                        off += 16
                        n -= 16
                }
@@ -1699,26 +1715,55 @@ func zero8(s *ssagen.State, reg int16, off int64) {
        p.To.Offset = off
 }
 
-// move16 copies 16 bytes at src+off to dst+off.
+// move32 copies 32 bytes at src+off to dst+off.
 // Uses registers tmp1 and tmp2.
-// If postInc is true, increment src and dst by 16.
-func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
-       // LDP     off(src), (tmp1, tmp2)
-       ld := s.Prog(arm64.ALDP)
+// If postInc is true, increment src and dst by 32.
+func move32(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
+       // FLDPQ   off(src), (tmp1, tmp2)
+       ld := s.Prog(arm64.AFLDPQ)
        ld.From.Type = obj.TYPE_MEM
        ld.From.Reg = src
        ld.From.Offset = off
        ld.To.Type = obj.TYPE_REGREG
        ld.To.Reg = tmp1
        ld.To.Offset = int64(tmp2)
-       // STP     (tmp1, tmp2), off(dst)
-       st := s.Prog(arm64.ASTP)
+       // FSTPQ   (tmp1, tmp2), off(dst)
+       st := s.Prog(arm64.AFSTPQ)
        st.From.Type = obj.TYPE_REGREG
        st.From.Reg = tmp1
        st.From.Offset = int64(tmp2)
        st.To.Type = obj.TYPE_MEM
        st.To.Reg = dst
        st.To.Offset = off
+       if postInc {
+               if off != 0 {
+                       panic("can't postinc with non-zero offset")
+               }
+               ld.Scond = arm64.C_XPOST
+               st.Scond = arm64.C_XPOST
+               ld.From.Offset = 32
+               st.To.Offset = 32
+       }
+}
+
+// move16 copies 16 bytes at src+off to dst+off.
+// Uses register tmp1.
+// If postInc is true, increment src and dst by 16.
+func move16(s *ssagen.State, src, dst, tmp1 int16, off int64, postInc bool) {
+       // FMOVQ     off(src), tmp1
+       ld := s.Prog(arm64.AFMOVQ)
+       ld.From.Type = obj.TYPE_MEM
+       ld.From.Reg = src
+       ld.From.Offset = off
+       ld.To.Type = obj.TYPE_REG
+       ld.To.Reg = tmp1
+       // FMOVQ     tmp1, off(dst)
+       st := s.Prog(arm64.AFMOVQ)
+       st.From.Type = obj.TYPE_REG
+       st.From.Reg = tmp1
+       st.To.Type = obj.TYPE_MEM
+       st.To.Reg = dst
+       st.To.Offset = off
        if postInc {
                if off != 0 {
                        panic("can't postinc with non-zero offset")
index 43072ae9130ede9c6c18c23c401fef81ce348b87..cc3758d10956d4c6c8c0e3b636a20348ac13141d 100644 (file)
@@ -144,8 +144,9 @@ func init() {
                gpspsbg    = gpspg | buildReg("SB")
                fp         = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
                callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+               r25        = buildReg("R25")
                r24to25    = buildReg("R24 R25")
-               r23to25    = buildReg("R23 R24 R25")
+               f16to17    = buildReg("F16 F17")
                rz         = buildReg("ZERO")
                first16    = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
        )
@@ -599,8 +600,8 @@ func init() {
                        aux:       "Int64",
                        argLength: 3,
                        reg: regInfo{
-                               inputs:   []regMask{gp &^ r24to25, gp &^ r24to25},
-                               clobbers: r24to25, // TODO: figure out needIntTemp x2
+                               inputs:   []regMask{gp &^ r25, gp &^ r25},
+                               clobbers: r25 | f16to17, // TODO: figure out needIntTemp + x2 for floats
                        },
                        faultOnNilArg0: true,
                        faultOnNilArg1: true,
@@ -617,8 +618,8 @@ func init() {
                        aux:       "Int64",
                        argLength: 3,
                        reg: regInfo{
-                               inputs:       []regMask{gp &^ r23to25, gp &^ r23to25},
-                               clobbers:     r23to25, // TODO: figure out needIntTemp x3
+                               inputs:       []regMask{gp &^ r24to25, gp &^ r24to25},
+                               clobbers:     r24to25 | f16to17, // TODO: figure out needIntTemp x2 + x2 for floats
                                clobbersArg0: true,
                                clobbersArg1: true,
                        },
index 9b38e66a23f0191b93542ff8767a0bb6e164199f..061f1333382af4c9a75e5f941a9304192087581d 100644 (file)
@@ -23199,10 +23199,10 @@ var opcodeTable = [...]opInfo{
                faultOnNilArg1: true,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
-                               {1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
+                               {0, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
+                               {1, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
                        },
-                       clobbers: 25165824, // R24 R25
+                       clobbers: 422212481843200, // R25 F16 F17
                },
        },
        {
@@ -23213,10 +23213,10 @@ var opcodeTable = [...]opInfo{
                faultOnNilArg1: true,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
-                               {1, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
+                               {0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
+                               {1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
                        },
-                       clobbers:     29360128, // R23 R24 R25
+                       clobbers:     422212490231808, // R24 R25 F16 F17
                        clobbersArg0: true,
                        clobbersArg1: true,
                },