From f736a9ad0105b7906636aab43df96123d0f32d70 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Tue, 28 Mar 2023 21:10:16 +0800
Subject: [PATCH] cmd/internal/obj/loong64: auto-align loop heads to 16-byte
 boundaries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CL 479816 took care of loops in hand-written assembly, but did not
account for those written in Go, which may become performance-sensitive
as well. In this patch, all loop heads are automatically identified and
aligned to 16-byte boundaries, by inserting a synthetic `PCALIGN $16`
before them. "Loop heads" are defined as targets of backward branches.
While at it, tweak some of the local comments so the flow is hopefully
clearer.

Because LoongArch instructions are all 32 bits long, at most 3 NOOPs
can be inserted for each target Prog. This may sound excessive, but
benchmark results indicate that the approach is profitable overall.
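
To make the "at most 3 NOOPs" bound concrete, here is a minimal sketch
of the padding arithmetic (illustrative only; padBytes is a
hypothetical stand-in for the assembler's pcAlignPadLength helper,
which additionally takes an *obj.Link):

	// padBytes returns the number of padding bytes needed to round
	// pc up to the next multiple of align, a power of two. With
	// align = 16 and 4-byte instructions, the result is 0, 4, 8 or
	// 12 bytes, i.e. at most 3 NOOPs per loop head.
	func padBytes(pc, align int64) int64 {
		return -pc & (align - 1)
	}

For example, a loop head at pc 0x24 receives 12 bytes (3 NOOPs) of
padding and ends up at 0x30.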

Benchmark results on Loongson 3A5000 (LA464):

goos: linux
goarch: loong64
pkg: test/bench/go1
                      │  CL 479816  │              this CL               │
                      │   sec/op    │   sec/op     vs base               │
BinaryTree17            14.10 ± 1%    14.06 ± 1%       ~ (p=0.280 n=10)
Fannkuch11              3.579 ± 0%    3.419 ± 0%  -4.45% (p=0.000 n=10)
FmtFprintfEmpty         94.73n ± 0%   94.44n ± 0%  -0.31% (p=0.000 n=10)
FmtFprintfString        151.9n ± 0%   149.1n ± 0%  -1.84% (p=0.000 n=10)
FmtFprintfInt           158.3n ± 0%   155.2n ± 0%  -1.96% (p=0.000 n=10)
FmtFprintfIntInt        241.4n ± 0%   235.4n ± 0%  -2.49% (p=0.000 n=10)
FmtFprintfPrefixedInt   320.2n ± 0%   314.7n ± 0%  -1.73% (p=0.000 n=10)
FmtFprintfFloat         414.3n ± 0%   398.7n ± 0%  -3.77% (p=0.000 n=10)
FmtManyArgs             949.9n ± 0%   929.8n ± 0%  -2.12% (p=0.000 n=10)
GobDecode               15.24m ± 0%   15.30m ± 0%  +0.38% (p=0.035 n=10)
GobEncode               18.10m ± 2%   17.59m ± 1%  -2.81% (p=0.002 n=10)
Gzip                    429.9m ± 0%   421.5m ± 0%  -1.97% (p=0.000 n=10)
Gunzip                  88.31m ± 0%   87.39m ± 0%  -1.04% (p=0.000 n=10)
HTTPClientServer        85.71µ ± 0%   87.24µ ± 0%  +1.79% (p=0.000 n=10)
JSONEncode              19.74m ± 0%   18.55m ± 0%  -6.00% (p=0.000 n=10)
JSONDecode              78.60m ± 1%   77.93m ± 0%  -0.84% (p=0.000 n=10)
Mandelbrot200           7.208m ± 0%   7.217m ± 0%       ~ (p=0.481 n=10)
GoParse                 7.616m ± 1%   7.630m ± 2%       ~ (p=0.796 n=10)
RegexpMatchEasy0_32     133.0n ± 0%   134.1n ± 0%  +0.83% (p=0.000 n=10)
RegexpMatchEasy0_1K     1.362µ ± 0%   1.364µ ± 0%  +0.15% (p=0.000 n=10)
RegexpMatchEasy1_32     161.8n ± 0%   163.7n ± 0%  +1.17% (p=0.000 n=10)
RegexpMatchEasy1_1K     1.497µ ± 0%   1.497µ ± 0%       ~ (p=1.000 n=10)
RegexpMatchMedium_32    1.420µ ± 0%   1.446µ ± 0%  +1.83% (p=0.000 n=10)
RegexpMatchMedium_1K    42.25µ ± 0%   42.53µ ± 0%  +0.65% (p=0.000 n=10)
RegexpMatchHard_32      2.108µ ± 0%   2.116µ ± 0%  +0.38% (p=0.000 n=10)
RegexpMatchHard_1K      62.65µ ± 0%   63.23µ ± 0%  +0.93% (p=0.000 n=10)
Revcomp                 1.192  ± 0%   1.198  ± 0%  +0.55% (p=0.000 n=10)
Template                115.6m ± 2%   116.9m ± 1%       ~ (p=0.075 n=10)
TimeParse               418.1n ± 1%   414.7n ± 0%  -0.81% (p=0.000 n=10)
TimeFormat              517.9n ± 0%   513.7n ± 0%  -0.81% (p=0.000 n=10)
geomean                 103.5µ        102.6µ       -0.79%

                      │  CL 479816   │               this CL               │
                      │     B/s      │     B/s       vs base               │
GobDecode               48.04Mi ± 0%   47.86Mi ± 0%  -0.38% (p=0.035 n=10)
GobEncode               40.44Mi ± 2%   41.61Mi ± 1%  +2.89% (p=0.001 n=10)
Gzip                    43.04Mi ± 0%   43.91Mi ± 0%  +2.02% (p=0.000 n=10)
Gunzip                  209.6Mi ± 0%   211.8Mi ± 0%  +1.05% (p=0.000 n=10)
JSONEncode              93.76Mi ± 0%   99.75Mi ± 0%  +6.39% (p=0.000 n=10)
JSONDecode              23.55Mi ± 1%   23.75Mi ± 0%  +0.85% (p=0.000 n=10)
GoParse                 7.253Mi ± 1%   7.238Mi ± 2%       ~ (p=0.698 n=10)
RegexpMatchEasy0_32     229.4Mi ± 0%   227.6Mi ± 0%  -0.82% (p=0.000 n=10)
RegexpMatchEasy0_1K     717.3Mi ± 0%   716.2Mi ± 0%  -0.15% (p=0.000 n=10)
RegexpMatchEasy1_32     188.6Mi ± 0%   186.4Mi ± 0%  -1.13% (p=0.000 n=10)
RegexpMatchEasy1_1K     652.2Mi ± 0%   652.3Mi ± 0%  +0.01% (p=0.005 n=10)
RegexpMatchMedium_32    21.49Mi ± 0%   21.11Mi ± 0%  -1.73% (p=0.000 n=10)
RegexpMatchMedium_1K    23.11Mi ± 0%   22.96Mi ± 0%  -0.62% (p=0.000 n=10)
RegexpMatchHard_32      14.48Mi ± 0%   14.42Mi ± 0%  -0.40% (p=0.000 n=10)
RegexpMatchHard_1K      15.59Mi ± 0%   15.44Mi ± 0%  -0.98% (p=0.000 n=10)
Revcomp                 203.4Mi ± 0%   202.3Mi ± 0%  -0.55% (p=0.000 n=10)
Template                16.00Mi ± 2%   15.83Mi ± 1%       ~ (p=0.078 n=10)
geomean                 60.72Mi        60.89Mi       +0.29%

The slight regressions in the Regexp cases are likely because the
previous numbers were merely coincidental: during development,
regressions or improvements of roughly ±10% showed up with changes
that were definitely irrelevant. This CL should (hopefully) bring
such random performance fluctuations down a bit.
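
For a sense of the kind of Go code this affects, consider a
hypothetical reduction loop (the function below is purely
illustrative):

	// sum's range loop typically compiles to a backward conditional
	// branch targeting the loop head, so with this CL the assembler
	// pads that head to a 16-byte boundary when it is misaligned.
	func sum(xs []int64) int64 {
		var s int64
		for _, x := range xs {
			s += x
		}
		return s
	}

One way to observe the padding is to disassemble a built binary with
`go tool objdump -s sum <binary>` and check the pc of the loop head.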

Change-Id: I8bdda6e65336da00d4ad79650937b3eeb9db0e7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/479817
Reviewed-by: Keith Randall
Reviewed-by: abner chenc
Reviewed-by: David Chase
TryBot-Result: Gopher Robot
Run-TryBot: WANG Xuerui
---
 src/cmd/internal/obj/loong64/asm.go | 55 ++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 8 deletions(-)

diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go
index 75b9302f24..fe2c91becf 100644
--- a/src/cmd/internal/obj/loong64/asm.go
+++ b/src/cmd/internal/obj/loong64/asm.go
@@ -28,6 +28,7 @@ type ctxt0 struct {
 
 const (
 	FuncAlign = 4
+	loopAlign = 16
 )
 
 type Optab struct {
@@ -45,6 +46,10 @@ type Optab struct {
 
 const (
 	NOTUSETMP = 1 << iota // p expands to multiple instructions, but does NOT use REGTMP
+
+	// branchLoopHead marks loop entry.
+	// Used to insert padding for under-aligned loops.
+	branchLoopHead
 )
 
 var optab = []Optab{
@@ -421,24 +426,58 @@ func span0(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 
 	c.cursym.Size = pc
 
-	/*
-	 * if any procedure is large enough to
-	 * generate a large SBRA branch, then
-	 * generate extra passes putting branches
-	 * around jmps to fix. this is rare.
-	 */
-	bflag := 1
+	// Mark loop entry instructions for padding.
+	// Loop heads are defined as targets of backward branches.
+	for p = c.cursym.Func().Text.Link; p != nil; p = p.Link {
+		if q := p.To.Target(); q != nil && q.Pc < p.Pc {
+			q.Mark |= branchLoopHead
+		}
+	}
 
+	// Run these passes until convergence.
+	bflag := 1
 	var otxt int64
 	var q *obj.Prog
 	for bflag != 0 {
 		bflag = 0
 		pc = 0
-		for p = c.cursym.Func().Text.Link; p != nil; p = p.Link {
+		prev := c.cursym.Func().Text
+		for p = prev.Link; p != nil; prev, p = p, p.Link {
 			p.Pc = pc
 			o = c.oplook(p)
 
+			// Prepend a PCALIGN $loopAlign to each of the loop heads
+			// that need padding, if not already done (because this
+			// pass may execute more than once).
+			//
+			// This needs to come before any pass that looks at pc,
+			// because pc will be adjusted if padding happens.
+			if p.Mark&branchLoopHead != 0 && pc&(loopAlign-1) != 0 &&
+				!(prev.As == obj.APCALIGN && prev.From.Offset >= loopAlign) {
+				q = c.newprog()
+				prev.Link = q
+				q.Link = p
+				q.Pc = pc
+				q.As = obj.APCALIGN
+				q.From.Type = obj.TYPE_CONST
+				q.From.Offset = loopAlign
+				// Don't associate the synthesized PCALIGN with
+				// the original source position, for a deterministic
+				// mapping between source and corresponding asm.
+				// q.Pos = p.Pos
+
+				// Manually make the PCALIGN take effect, since
+				// this loop iteration is the one for p.
+				pc += int64(pcAlignPadLength(ctxt, pc, loopAlign))
+				p.Pc = pc
+			}
+
+			// very large conditional branches
+			//
+			// if any procedure is large enough to
+			// generate a large SBRA branch, then
+			// generate extra passes putting branches
+			// around jmps to fix. this is rare.
 			if o.type_ == 6 && p.To.Target() != nil {
 				otxt = p.To.Target().Pc - pc
 				if otxt < -(1<<17)+10 || otxt >= (1<<17)-10 {
--
2.48.1