From: Vasily Leonenko
Date: Fri, 25 Jul 2025 20:06:33 +0000 (+0300)
Subject: cmd/compile: use arm64 neon in LoweredMemmove/LoweredMemmoveLoop
X-Git-Tag: go1.26rc1~612
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=5c9a26c7f882dba5bfe10036815bcb239dd9b7e8;p=gostls13.git

cmd/compile: use arm64 neon in LoweredMemmove/LoweredMemmoveLoop

Raspberry Pi 5 (Cortex-A76)

                     │  base.log   │              opt.log               │
                     │   sec/op    │   sec/op     vs base               │
MemmoveKnownSize112    3.549n ± 0%   3.652n ± 0%    +2.92% (p=0.000 n=10)
MemmoveKnownSize128    3.979n ± 0%   3.617n ± 0%    -9.09% (p=0.000 n=10)
MemmoveKnownSize192    7.566n ± 0%   5.074n ± 0%   -32.94% (p=0.000 n=10)
MemmoveKnownSize248    8.549n ± 0%   7.184n ± 1%   -15.97% (p=0.000 n=10)
MemmoveKnownSize256   10.010n ± 0%   6.827n ± 0%   -31.80% (p=0.000 n=10)
MemmoveKnownSize512    19.81n ± 0%   13.59n ± 0%   -31.40% (p=0.000 n=10)
MemmoveKnownSize1024   39.66n ± 0%   27.00n ± 0%   -31.93% (p=0.000 n=10)
geomean                9.538n        7.392n        -22.50%

Change-Id: I7b17408cd0a500ceaa80bc93ffe2f19ddeea9c0d
Reviewed-on: https://go-review.googlesource.com/c/go/+/692315
Reviewed-by: Keith Randall
Reviewed-by: David Chase
LUCI-TryBot-Result: Go LUCI
---

diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 7bc0e536e9..43ecb6b4b7 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -1189,8 +1189,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		if dstReg == srcReg {
 			break
 		}
-		tmpReg1 := int16(arm64.REG_R24)
-		tmpReg2 := int16(arm64.REG_R25)
+		tmpReg1 := int16(arm64.REG_R25)
+		tmpFReg1 := int16(arm64.REG_F16)
+		tmpFReg2 := int16(arm64.REG_F17)
 		n := v.AuxInt
 		if n < 16 {
 			v.Fatalf("Move too small %d", n)
@@ -1198,10 +1199,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 
 		// Generate copying instructions.
 		var off int64
+		for n >= 32 {
+			// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
+			// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
+			move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
+			off += 32
+			n -= 32
+		}
 		for n >= 16 {
-			// LDP off(srcReg), (tmpReg1, tmpReg2)
-			// STP (tmpReg1, tmpReg2), off(dstReg)
-			move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+			// FMOVQ off(src), tmpFReg1
+			// FMOVQ tmpFReg1, off(dst)
+			move16(s, srcReg, dstReg, tmpFReg1, off, false)
 			off += 16
 			n -= 16
 		}
@@ -1223,9 +1231,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		if dstReg == srcReg {
 			break
 		}
-		countReg := int16(arm64.REG_R23)
-		tmpReg1 := int16(arm64.REG_R24)
-		tmpReg2 := int16(arm64.REG_R25)
+		countReg := int16(arm64.REG_R24)
+		tmpReg1 := int16(arm64.REG_R25)
+		tmpFReg1 := int16(arm64.REG_F16)
+		tmpFReg2 := int16(arm64.REG_F17)
 		n := v.AuxInt
 		loopSize := int64(64)
 		if n < 3*loopSize {
@@ -1251,10 +1260,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 
 		// Move loopSize bytes starting at srcReg to dstReg.
 		// Increment srcReg and destReg by loopSize as a side effect.
-		for range loopSize / 16 {
-			// LDP.P 16(srcReg), (tmpReg1, tmpReg2)
-			// STP.P (tmpReg1, tmpReg2), 16(dstReg)
-			move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
+		for range loopSize / 32 {
+			// FLDPQ.P 32(srcReg), (tmpFReg1, tmpFReg2)
+			// FSTPQ.P (tmpFReg1, tmpFReg2), 32(dstReg)
+			move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, 0, true)
 		}
 		// Decrement loop count.
 		// SUB $1, countReg
@@ -1276,10 +1285,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 
 		// Copy any fractional portion.
 		var off int64
+		for n >= 32 {
+			// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
+			// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
+			move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
+			off += 32
+			n -= 32
+		}
 		for n >= 16 {
-			// LDP off(srcReg), (tmpReg1, tmpReg2)
-			// STP (tmpReg1, tmpReg2), off(dstReg)
-			move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+			// FMOVQ off(src), tmpFReg1
+			// FMOVQ tmpFReg1, off(dst)
+			move16(s, srcReg, dstReg, tmpFReg1, off, false)
 			off += 16
 			n -= 16
 		}
@@ -1699,26 +1715,55 @@ func zero8(s *ssagen.State, reg int16, off int64) {
 	p.To.Offset = off
 }
 
-// move16 copies 16 bytes at src+off to dst+off.
+// move32 copies 32 bytes at src+off to dst+off.
 // Uses registers tmp1 and tmp2.
-// If postInc is true, increment src and dst by 16.
-func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
-	// LDP off(src), (tmp1, tmp2)
-	ld := s.Prog(arm64.ALDP)
+// If postInc is true, increment src and dst by 32.
+func move32(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
+	// FLDPQ off(src), (tmp1, tmp2)
+	ld := s.Prog(arm64.AFLDPQ)
 	ld.From.Type = obj.TYPE_MEM
 	ld.From.Reg = src
 	ld.From.Offset = off
 	ld.To.Type = obj.TYPE_REGREG
 	ld.To.Reg = tmp1
 	ld.To.Offset = int64(tmp2)
-	// STP (tmp1, tmp2), off(dst)
-	st := s.Prog(arm64.ASTP)
+	// FSTPQ (tmp1, tmp2), off(dst)
+	st := s.Prog(arm64.AFSTPQ)
 	st.From.Type = obj.TYPE_REGREG
 	st.From.Reg = tmp1
 	st.From.Offset = int64(tmp2)
 	st.To.Type = obj.TYPE_MEM
 	st.To.Reg = dst
 	st.To.Offset = off
+	if postInc {
+		if off != 0 {
+			panic("can't postinc with non-zero offset")
+		}
+		ld.Scond = arm64.C_XPOST
+		st.Scond = arm64.C_XPOST
+		ld.From.Offset = 32
+		st.To.Offset = 32
+	}
+}
+
+// move16 copies 16 bytes at src+off to dst+off.
+// Uses register tmp1
+// If postInc is true, increment src and dst by 16.
+func move16(s *ssagen.State, src, dst, tmp1 int16, off int64, postInc bool) {
+	// FMOVQ off(src), tmp1
+	ld := s.Prog(arm64.AFMOVQ)
+	ld.From.Type = obj.TYPE_MEM
+	ld.From.Reg = src
+	ld.From.Offset = off
+	ld.To.Type = obj.TYPE_REG
+	ld.To.Reg = tmp1
+	// FMOVQ tmp1, off(dst)
+	st := s.Prog(arm64.AFMOVQ)
+	st.From.Type = obj.TYPE_REG
+	st.From.Reg = tmp1
+	st.To.Type = obj.TYPE_MEM
+	st.To.Reg = dst
+	st.To.Offset = off
 	if postInc {
 		if off != 0 {
 			panic("can't postinc with non-zero offset")
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
index 43072ae913..cc3758d109 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
@@ -144,8 +144,9 @@ func init() {
 		gpspsbg    = gpspg | buildReg("SB")
 		fp         = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
 		callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+		r25        = buildReg("R25")
 		r24to25    = buildReg("R24 R25")
-		r23to25    = buildReg("R23 R24 R25")
+		f16to17    = buildReg("F16 F17")
 		rz         = buildReg("ZERO")
 		first16    = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
 	)
@@ -599,8 +600,8 @@ func init() {
 			aux:       "Int64",
 			argLength: 3,
 			reg: regInfo{
-				inputs:   []regMask{gp &^ r24to25, gp &^ r24to25},
-				clobbers: r24to25, // TODO: figure out needIntTemp x2
+				inputs:   []regMask{gp &^ r25, gp &^ r25},
+				clobbers: r25 | f16to17, // TODO: figure out needIntTemp + x2 for floats
 			},
 			faultOnNilArg0: true,
 			faultOnNilArg1: true,
@@ -617,8 +618,8 @@ func init() {
 			aux:       "Int64",
 			argLength: 3,
 			reg: regInfo{
-				inputs:       []regMask{gp &^ r23to25, gp &^ r23to25},
-				clobbers:     r23to25, // TODO: figure out needIntTemp x3
+				inputs:       []regMask{gp &^ r24to25, gp &^ r24to25},
+				clobbers:     r24to25 | f16to17, // TODO: figure out needIntTemp x2 + x2 for floats
 				clobbersArg0: true,
 				clobbersArg1: true,
 			},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 9b38e66a23..061f133338 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -23199,10 +23199,10 @@ var opcodeTable = [...]opInfo{
 		faultOnNilArg1: true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
-				{1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
+				{0, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
+				{1, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
 			},
-			clobbers: 25165824, // R24 R25
+			clobbers: 422212481843200, // R25 F16 F17
 		},
 	},
 	{
@@ -23213,10 +23213,10 @@ var opcodeTable = [...]opInfo{
 		faultOnNilArg1: true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
-				{1, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
+				{0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
+				{1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
 			},
-			clobbers:     29360128, // R23 R24 R25
+			clobbers:     422212490231808, // R24 R25 F16 F17
 			clobbersArg0: true,
 			clobbersArg1: true,
 		},