From f322d67ced53f413c4b35f41f754fa34f440b012 Mon Sep 17 00:00:00 2001 From: Mauri de Souza Meneguzzo Date: Fri, 28 Jul 2023 19:10:04 +0000 Subject: [PATCH] cmd/asm: add PCALIGN support on 386/amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The PCALIGN asm directive was not supported on 386/amd64, causing a compile-time error when used. The same directive is currently supported on arm64, loong64 and ppc64 architectures. This has potential for noticeable performance improvements on amd64 across multiple packages, I did a quick test aligning a hot loop on bytes.IndexByte: ``` IndexByte/10-16 3.477n ± ∞ ¹ 3.462n ± ∞ ¹ ~ (p=0.198 n=5) IndexByte/32-16 4.675n ± ∞ ¹ 4.834n ± ∞ ¹ +3.40% (p=0.008 n=5) IndexByte/4K-16 67.47n ± ∞ ¹ 44.44n ± ∞ ¹ -34.13% (p=0.008 n=5) IndexByte/4M-16 61.98µ ± ∞ ¹ 45.07µ ± ∞ ¹ -27.28% (p=0.008 n=5) IndexByte/64M-16 1206.6µ ± ∞ ¹ 940.9µ ± ∞ ¹ -22.02% (p=0.008 n=5) IndexBytePortable/10-16 4.064n ± ∞ ¹ 4.044n ± ∞ ¹ ~ (p=0.325 n=5) IndexBytePortable/32-16 9.999n ± ∞ ¹ 9.934n ± ∞ ¹ ~ (p=0.151 n=5) IndexBytePortable/4K-16 975.8n ± ∞ ¹ 965.5n ± ∞ ¹ ~ (p=0.151 n=5) IndexBytePortable/4M-16 973.3µ ± ∞ ¹ 972.3µ ± ∞ ¹ ~ (p=0.222 n=5) IndexBytePortable/64M-16 15.68m ± ∞ ¹ 15.89m ± ∞ ¹ ~ (p=0.310 n=5) geomean 1.478µ 1.342µ -9.20% IndexByte/10-16 2.678Gi ± ∞ ¹ 2.690Gi ± ∞ ¹ ~ (p=0.151 n=5) IndexByte/32-16 6.375Gi ± ∞ ¹ 6.165Gi ± ∞ ¹ -3.30% (p=0.008 n=5) IndexByte/4K-16 56.54Gi ± ∞ ¹ 85.85Gi ± ∞ ¹ +51.83% (p=0.008 n=5) IndexByte/4M-16 63.03Gi ± ∞ ¹ 86.68Gi ± ∞ ¹ +37.52% (p=0.008 n=5) IndexByte/64M-16 51.80Gi ± ∞ ¹ 66.42Gi ± ∞ ¹ +28.23% (p=0.008 n=5) IndexBytePortable/10-16 2.291Gi ± ∞ ¹ 2.303Gi ± ∞ ¹ ~ (p=0.421 n=5) IndexBytePortable/32-16 2.980Gi ± ∞ ¹ 3.000Gi ± ∞ ¹ ~ (p=0.151 n=5) IndexBytePortable/4K-16 3.909Gi ± ∞ ¹ 3.951Gi ± ∞ ¹ ~ (p=0.151 n=5) IndexBytePortable/4M-16 4.013Gi ± ∞ ¹ 4.017Gi ± ∞ ¹ ~ (p=0.222 n=5) IndexBytePortable/64M-16 3.987Gi ± ∞ ¹ 3.933Gi ± ∞ ¹ ~ (p=0.310 n=5) geomean 8.183Gi 9.013Gi +10.14% ``` Fixes #56474 Change-Id: Idea022b1a16e6d4b8dd778723adb862c46602c4f GitHub-Last-Rev: 2eb7e31dc378a02fd83faa7d41239df0f2859677 GitHub-Pull-Request: golang/go#61516 Reviewed-on: https://go-review.googlesource.com/c/go/+/511662 Run-TryBot: Keith Randall Reviewed-by: Cherry Mui TryBot-Result: Gopher Robot Reviewed-by: Keith Randall Reviewed-by: Keith Randall Auto-Submit: Keith Randall --- src/cmd/internal/obj/x86/asm6.go | 38 +++++++++++++++++++++ src/cmd/internal/obj/x86/asm_test.go | 51 ++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index 782b6d4aff..5e988eaf48 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -2036,6 +2036,31 @@ type nopPad struct { n int32 // Size of the pad } +// Padding bytes to add to align code as requested. +// Alignment is restricted to powers of 2 between 8 and 2048 inclusive. +// +// pc: current offset in function, in bytes +// a: requested alignment, in bytes +// cursym: current function being assembled +// returns number of bytes of padding needed +func addpad(pc, a int64, ctxt *obj.Link, cursym *obj.LSym) int { + if !((a&(a-1) == 0) && 8 <= a && a <= 2048) { + ctxt.Diag("alignment value of an instruction must be a power of two and in the range [8, 2048], got %d\n", a) + return 0 + } + + // By default function alignment is 32 bytes for amd64 + if cursym.Func().Align < int32(a) { + cursym.Func().Align = int32(a) + } + + if pc&(a-1) != 0 { + return int(a - (pc & (a - 1))) + } + + return 0 +} + func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { if ctxt.Retpoline && ctxt.Arch.Family == sys.I386 { ctxt.Diag("-spectre=ret not supported on 386") @@ -2119,6 +2144,19 @@ func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { c0 := c c = pjc.padJump(ctxt, s, p, c) + if p.As == obj.APCALIGN { + aln := p.From.Offset + v := addpad(int64(c), aln, ctxt, s) + if v > 0 { + s.Grow(int64(c) + int64(v)) + fillnop(s.P[c:], int(v)) + } + + c += int32(v) + pPrev = p + continue + } + if maxLoopPad > 0 && p.Back&branchLoopHead != 0 && c&(loopAlign-1) != 0 { // pad with NOPs v := -c & (loopAlign - 1) diff --git a/src/cmd/internal/obj/x86/asm_test.go b/src/cmd/internal/obj/x86/asm_test.go index 36c8fce675..458a91258a 100644 --- a/src/cmd/internal/obj/x86/asm_test.go +++ b/src/cmd/internal/obj/x86/asm_test.go @@ -7,6 +7,10 @@ package x86 import ( "cmd/internal/obj" "cmd/internal/objabi" + "internal/testenv" + "os" + "path/filepath" + "regexp" "testing" ) @@ -289,3 +293,50 @@ func TestRegIndex(t *testing.T) { } } } + +// TestPCALIGN verifies the correctness of the PCALIGN by checking if the +// code can be aligned to the alignment value. +func TestPCALIGN(t *testing.T) { + testenv.MustHaveGoBuild(t) + dir := t.TempDir() + tmpfile := filepath.Join(dir, "test.s") + tmpout := filepath.Join(dir, "test.o") + + var testCases = []struct { + name string + code string + out string + }{ + { + name: "8-byte alignment", + code: "TEXT ·foo(SB),$0-0\nMOVQ $0, AX\nPCALIGN $8\nMOVQ $1, BX\nRET\n", + out: `0x0008\s00008\s\(.*\)\tMOVQ\t\$1,\sBX`, + }, + { + name: "16-byte alignment", + code: "TEXT ·foo(SB),$0-0\nMOVQ $0, AX\nPCALIGN $16\nMOVQ $2, CX\nRET\n", + out: `0x0010\s00016\s\(.*\)\tMOVQ\t\$2,\sCX`, + }, + } + + for _, test := range testCases { + if err := os.WriteFile(tmpfile, []byte(test.code), 0644); err != nil { + t.Fatal(err) + } + cmd := testenv.Command(t, testenv.GoToolPath(t), "tool", "asm", "-S", "-o", tmpout, tmpfile) + cmd.Env = append(os.Environ(), "GOARCH=amd64", "GOOS=linux") + out, err := cmd.CombinedOutput() + if err != nil { + t.Errorf("The %s build failed: %v, output: %s", test.name, err, out) + continue + } + + matched, err := regexp.MatchString(test.out, string(out)) + if err != nil { + t.Fatal(err) + } + if !matched { + t.Errorf("The %s testing failed!\ninput: %s\noutput: %s\n", test.name, test.code, out) + } + } +} -- 2.50.0