From: Austin Clements <austin@google.com>
Date: Thu, 8 Apr 2021 21:43:51 +0000 (-0400)
Subject: runtime: port performance-critical functions to regabi
X-Git-Tag: go1.17beta1~711
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=849dba07a5;p=gostls13.git

runtime: port performance-critical functions to regabi

This CL ports a few performance-critical runtime assembly functions to
use register arguments directly. While using the faster ABI is nice,
the real win here is that we avoid ABI wrappers: since these are
"builtin" functions in the compiler, it can generate calls to them
without knowing that their native implementation is ABI0. Hence, it
generates ABIInternal calls that go through ABI wrappers. By porting
them to use ABIInternal natively, we avoid the overhead of the ABI
wrapper.

This significantly improves performance on several benchmarks,
comparing regabiwrappers before and after this change:

name                                old time/op  new time/op  delta
BiogoIgor                            15.7s ± 2%   15.7s ± 2%    ~     (p=0.617 n=25+25)
BiogoKrishna                         18.5s ± 5%   17.7s ± 2%  -4.61%  (p=0.000 n=25+25)
BleveIndexBatch100                   5.91s ± 3%   5.82s ± 3%  -1.60%  (p=0.000 n=25+25)
BleveQuery                           6.76s ± 0%   6.60s ± 1%  -2.31%  (p=0.000 n=22+25)
CompileTemplate                      248ms ± 5%   245ms ± 1%    ~     (p=0.643 n=25+20)
CompileUnicode                      94.4ms ± 3%  93.9ms ± 2%    ~     (p=0.152 n=24+23)
CompileGoTypes                       1.60s ± 2%   1.59s ± 2%    ~     (p=0.059 n=24+24)
CompileCompiler                      104ms ± 3%   103ms ± 1%    ~     (p=0.056 n=25+22)
CompileSSA                           10.9s ± 1%   10.9s ± 1%    ~     (p=0.052 n=25+25)
CompileFlate                         156ms ± 8%   152ms ± 1%  -2.49%  (p=0.008 n=25+21)
CompileGoParser                      248ms ± 1%   249ms ± 2%    ~     (p=0.058 n=21+20)
CompileReflect                       595ms ± 3%   601ms ± 4%    ~     (p=0.182 n=25+25)
CompileTar                           211ms ± 2%   211ms ± 1%    ~     (p=0.663 n=23+23)
CompileXML                           282ms ± 2%   284ms ± 5%    ~     (p=0.456 n=21+23)
CompileStdCmd                        13.6s ± 2%   13.5s ± 2%    ~     (p=0.112 n=25+24)
FoglemanFauxGLRenderRotateBoat       8.69s ± 2%   8.67s ± 0%    ~     (p=0.094 n=22+25)
FoglemanPathTraceRenderGopherIter1   20.2s ± 2%   20.7s ± 3%  +2.53%  (p=0.000 n=24+24)
GopherLuaKNucleotide                 31.4s ± 1%   31.0s ± 1%  -1.28%  (p=0.000 n=25+24)
MarkdownRenderXHTML                  246ms ± 1%   244ms ± 1%  -0.79%  (p=0.000 n=20+21)
Tile38WithinCircle100kmRequest       843µs ± 4%   818µs ± 4%  -2.93%  (p=0.000 n=25+25)
Tile38IntersectsCircle100kmRequest  1.06ms ± 5%  1.05ms ± 3%  -1.19%  (p=0.021 n=24+25)
Tile38KNearestLimit100Request       1.01ms ± 1%  1.01ms ± 2%    ~     (p=0.335 n=22+25)
[Geo mean]                           596ms        592ms       -0.71%

(https://perf.golang.org/search?q=upload:20210411.5)

It also significantly reduces the performance penalty of enabling
regabiwrappers, though it doesn't yet fully close the gap on all
benchmarks:

name                                old time/op  new time/op  delta
BiogoIgor                            15.7s ± 1%   15.7s ± 2%    ~     (p=0.366 n=24+25)
BiogoKrishna                         17.7s ± 2%   17.7s ± 2%    ~     (p=0.315 n=23+25)
BleveIndexBatch100                   5.86s ± 4%   5.82s ± 3%    ~     (p=0.137 n=24+25)
BleveQuery                           6.55s ± 0%   6.60s ± 1%  +0.83%  (p=0.000 n=24+25)
CompileTemplate                      244ms ± 1%   245ms ± 1%    ~     (p=0.208 n=21+20)
CompileUnicode                      94.0ms ± 4%  93.9ms ± 2%    ~     (p=0.666 n=24+23)
CompileGoTypes                       1.60s ± 2%   1.59s ± 2%    ~     (p=0.154 n=25+24)
CompileCompiler                      103ms ± 1%   103ms ± 1%    ~     (p=0.905 n=24+22)
CompileSSA                           10.9s ± 2%   10.9s ± 1%    ~     (p=0.803 n=25+25)
CompileFlate                         153ms ± 1%   152ms ± 1%    ~     (p=0.182 n=23+21)
CompileGoParser                      250ms ± 2%   249ms ± 2%    ~     (p=0.843 n=24+20)
CompileReflect                       595ms ± 4%   601ms ± 4%    ~     (p=0.141 n=25+25)
CompileTar                           212ms ± 3%   211ms ± 1%    ~     (p=0.499 n=23+23)
CompileXML                           282ms ± 1%   284ms ± 5%    ~     (p=0.129 n=20+23)
CompileStdCmd                        13.5s ± 2%   13.5s ± 2%    ~     (p=0.480 n=24+24)
FoglemanFauxGLRenderRotateBoat       8.66s ± 1%   8.67s ± 0%    ~     (p=0.325 n=25+25)
FoglemanPathTraceRenderGopherIter1   20.6s ± 3%   20.7s ± 3%    ~     (p=0.137 n=25+24)
GopherLuaKNucleotide                 30.5s ± 2%   31.0s ± 1%  +1.68%  (p=0.000 n=23+24)
MarkdownRenderXHTML                  243ms ± 1%   244ms ± 1%  +0.51%  (p=0.000 n=23+21)
Tile38WithinCircle100kmRequest       801µs ± 2%   818µs ± 4%  +2.11%  (p=0.000 n=25+25)
Tile38IntersectsCircle100kmRequest  1.01ms ± 2%  1.05ms ± 3%  +4.34%  (p=0.000 n=24+25)
Tile38KNearestLimit100Request       1.00ms ± 1%  1.01ms ± 2%  +0.81%  (p=0.008 n=21+25)
[Geo mean]                           589ms        592ms       +0.50%

(https://perf.golang.org/search?q=upload:20210411.6)

Change-Id: I8f77f010b0abc658064df569a27a9c7a7b1c7bf9
Reviewed-on: https://go-review.googlesource.com/c/go/+/308931
Trust: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
---

diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 77f4939b30..e883f20045 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1011,34 +1011,62 @@ done:
 
 // func memhash(p unsafe.Pointer, h, s uintptr) uintptr
 // hash function using AES hardware instructions
-TEXT runtime·memhash(SB),NOSPLIT,$0-32
+TEXT runtime·memhash<ABIInternal>(SB),NOSPLIT,$0-32
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr to data
+	// BX = seed
+	// CX = size
+#endif
 	CMPB	runtime·useAeshash(SB), $0
 	JEQ	noaes
+#ifndef GOEXPERIMENT_regabiargs
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	s+16(FP), CX	// size
 	LEAQ	ret+24(FP), DX
+#endif
 	JMP	aeshashbody<>(SB)
 noaes:
-	JMP	runtime·memhashFallback(SB)
+	JMP	runtime·memhashFallback<ABIInternal>(SB)
 
 // func strhash(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·strhash(SB),NOSPLIT,$0-24
+TEXT runtime·strhash<ABIInternal>(SB),NOSPLIT,$0-24
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr to string struct
+	// BX = seed
+#endif
 	CMPB	runtime·useAeshash(SB), $0
 	JEQ	noaes
+#ifndef GOEXPERIMENT_regabiargs
 	MOVQ	p+0(FP), AX	// ptr to string struct
+#endif
 	MOVQ	8(AX), CX	// length of string
 	MOVQ	(AX), AX	// string data
+#ifndef GOEXPERIMENT_regabiargs
 	LEAQ	ret+16(FP), DX
+#endif
 	JMP	aeshashbody<>(SB)
 noaes:
-	JMP	runtime·strhashFallback(SB)
+	JMP	runtime·strhashFallback<ABIInternal>(SB)
 
 // AX: data
+#ifdef GOEXPERIMENT_regabiargs
+// BX: hash seed
+#else
+// h+8(FP): hash seed
+#endif
 // CX: length
+#ifdef GOEXPERIMENT_regabiargs
+// At return: AX = return value
+#else
 // DX: address to put return value
+#endif
 TEXT aeshashbody<>(SB),NOSPLIT,$0-0
 	// Fill an SSE register with our seeds.
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	BX, X0				// 64 bits of per-table hash seed
+#else
 	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
+#endif
 	PINSRW	$4, CX, X0			// 16 bits of length
 	PSHUFHW $0, X0, X0			// repeat length 4 times total
 	MOVO	X0, X1				// save unscrambled seed
@@ -1075,7 +1103,11 @@ final1:
 	AESENC	X1, X1	// scramble combo 3 times
 	AESENC	X1, X1
 	AESENC	X1, X1
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X1, AX	// return X1
+#else
 	MOVQ	X1, (DX)
+#endif
 	RET
 
 endofpage:
@@ -1091,7 +1123,11 @@ endofpage:
 aes0:
 	// Return scrambled input seed
 	AESENC	X0, X0
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X0, AX	// return X0
+#else
 	MOVQ	X0, (DX)
+#endif
 	RET
 
 aes16:
@@ -1121,7 +1157,11 @@ aes17to32:
 
 	// combine results
 	PXOR	X3, X2
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X2, AX	// return X2
+#else
 	MOVQ	X2, (DX)
+#endif
 	RET
 
 aes33to64:
@@ -1163,7 +1203,11 @@ aes33to64:
 	PXOR	X6, X4
 	PXOR	X7, X5
 	PXOR	X5, X4
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X4, AX	// return X4
+#else
 	MOVQ	X4, (DX)
+#endif
 	RET
 
 aes65to128:
@@ -1245,7 +1289,15 @@ aes65to128:
 	PXOR	X10, X8
 	PXOR	X11, X9
 	PXOR	X9, X8
+#ifdef GOEXPERIMENT_regabig
+	// X15 must be zero on return
+	PXOR	X15, X15
+#endif
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X8, AX	// return X8
+#else
 	MOVQ	X8, (DX)
+#endif
 	RET
 
 aes129plus:
@@ -1361,38 +1413,73 @@ aesloop:
 	PXOR	X10, X8
 	PXOR	X11, X9
 	PXOR	X9, X8
+#ifdef GOEXPERIMENT_regabig
+	// X15 must be zero on return
+	PXOR	X15, X15
+#endif
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X8, AX	// return X8
+#else
 	MOVQ	X8, (DX)
+#endif
 	RET
 
 // func memhash32(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·memhash32(SB),NOSPLIT,$0-24
+// ABIInternal for performance.
+TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT,$0-24
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr to data
+	// BX = seed
+#endif
 	CMPB	runtime·useAeshash(SB), $0
 	JEQ	noaes
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	BX, X0	// X0 = seed
+#else
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	h+8(FP), X0	// seed
+#endif
 	PINSRD	$2, (AX), X0	// data
 	AESENC	runtime·aeskeysched+0(SB), X0
 	AESENC	runtime·aeskeysched+16(SB), X0
 	AESENC	runtime·aeskeysched+32(SB), X0
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X0, AX	// return X0
+#else
 	MOVQ	X0, ret+16(FP)
+#endif
 	RET
 noaes:
-	JMP	runtime·memhash32Fallback(SB)
+	JMP	runtime·memhash32Fallback<ABIInternal>(SB)
 
 // func memhash64(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·memhash64(SB),NOSPLIT,$0-24
+// ABIInternal for performance.
+TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT,$0-24
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr to data
+	// BX = seed
+#else
+#endif
 	CMPB	runtime·useAeshash(SB), $0
 	JEQ	noaes
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	BX, X0	// X0 = seed
+#else
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	h+8(FP), X0	// seed
+#endif
 	PINSRQ	$1, (AX), X0	// data
 	AESENC	runtime·aeskeysched+0(SB), X0
 	AESENC	runtime·aeskeysched+16(SB), X0
 	AESENC	runtime·aeskeysched+32(SB), X0
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X0, AX	// return X0
+#else
 	MOVQ	X0, ret+16(FP)
+#endif
 	RET
 noaes:
-	JMP	runtime·memhash64Fallback(SB)
+	JMP	runtime·memhash64Fallback<ABIInternal>(SB)
 
 // simple mask to get rid of data in the high part of the register.
 DATA masks<>+0x00(SB)/8, $0x0000000000000000
diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index 37fe9745b1..b4bc9988ec 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -12,9 +12,16 @@
 // See memclrNoHeapPointers Go doc for important implementation constraints.
 
 // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
-TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
+// ABIInternal for performance.
+TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr
+	// BX = n
+	MOVQ	AX, DI	// DI = ptr
+#else
 	MOVQ	ptr+0(FP), DI
 	MOVQ	n+8(FP), BX
+#endif
 	XORQ	AX, AX
 
 	// MOVOU seems always faster than REP STOSQ.
@@ -31,7 +38,9 @@ tail:
 	JE	_8
 	CMPQ	BX, $16
 	JBE	_9through16
-	PXOR	X0, X0
+#ifndef GOEXPERIMENT_regabig
+	PXOR	X15, X15
+#endif
 	CMPQ	BX, $32
 	JBE	_17through32
 	CMPQ	BX, $64
@@ -45,22 +54,22 @@ tail:
 	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
 
 loop:
-	MOVOU	X0, 0(DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, 32(DI)
-	MOVOU	X0, 48(DI)
-	MOVOU	X0, 64(DI)
-	MOVOU	X0, 80(DI)
-	MOVOU	X0, 96(DI)
-	MOVOU	X0, 112(DI)
-	MOVOU	X0, 128(DI)
-	MOVOU	X0, 144(DI)
-	MOVOU	X0, 160(DI)
-	MOVOU	X0, 176(DI)
-	MOVOU	X0, 192(DI)
-	MOVOU	X0, 208(DI)
-	MOVOU	X0, 224(DI)
-	MOVOU	X0, 240(DI)
+	MOVOU	X15, 0(DI)
+	MOVOU	X15, 16(DI)
+	MOVOU	X15, 32(DI)
+	MOVOU	X15, 48(DI)
+	MOVOU	X15, 64(DI)
+	MOVOU	X15, 80(DI)
+	MOVOU	X15, 96(DI)
+	MOVOU	X15, 112(DI)
+	MOVOU	X15, 128(DI)
+	MOVOU	X15, 144(DI)
+	MOVOU	X15, 160(DI)
+	MOVOU	X15, 176(DI)
+	MOVOU	X15, 192(DI)
+	MOVOU	X15, 208(DI)
+	MOVOU	X15, 224(DI)
+	MOVOU	X15, 240(DI)
 	SUBQ	$256, BX
 	ADDQ	$256, DI
 	CMPQ	BX, $256
@@ -141,40 +150,40 @@ _9through16:
 	MOVQ	AX, -8(DI)(BX*1)
 	RET
 _17through32:
-	MOVOU	X0, (DI)
-	MOVOU	X0, -16(DI)(BX*1)
+	MOVOU	X15, (DI)
+	MOVOU	X15, -16(DI)(BX*1)
 	RET
 _33through64:
-	MOVOU	X0, (DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, -32(DI)(BX*1)
-	MOVOU	X0, -16(DI)(BX*1)
+	MOVOU	X15, (DI)
+	MOVOU	X15, 16(DI)
+	MOVOU	X15, -32(DI)(BX*1)
+	MOVOU	X15, -16(DI)(BX*1)
 	RET
 _65through128:
-	MOVOU	X0, (DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, 32(DI)
-	MOVOU	X0, 48(DI)
-	MOVOU	X0, -64(DI)(BX*1)
-	MOVOU	X0, -48(DI)(BX*1)
-	MOVOU	X0, -32(DI)(BX*1)
-	MOVOU	X0, -16(DI)(BX*1)
+	MOVOU	X15, (DI)
+	MOVOU	X15, 16(DI)
+	MOVOU	X15, 32(DI)
+	MOVOU	X15, 48(DI)
+	MOVOU	X15, -64(DI)(BX*1)
+	MOVOU	X15, -48(DI)(BX*1)
+	MOVOU	X15, -32(DI)(BX*1)
+	MOVOU	X15, -16(DI)(BX*1)
 	RET
 _129through256:
-	MOVOU	X0, (DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, 32(DI)
-	MOVOU	X0, 48(DI)
-	MOVOU	X0, 64(DI)
-	MOVOU	X0, 80(DI)
-	MOVOU	X0, 96(DI)
-	MOVOU	X0, 112(DI)
-	MOVOU	X0, -128(DI)(BX*1)
-	MOVOU	X0, -112(DI)(BX*1)
-	MOVOU	X0, -96(DI)(BX*1)
-	MOVOU	X0, -80(DI)(BX*1)
-	MOVOU	X0, -64(DI)(BX*1)
-	MOVOU	X0, -48(DI)(BX*1)
-	MOVOU	X0, -32(DI)(BX*1)
-	MOVOU	X0, -16(DI)(BX*1)
+	MOVOU	X15, (DI)
+	MOVOU	X15, 16(DI)
+	MOVOU	X15, 32(DI)
+	MOVOU	X15, 48(DI)
+	MOVOU	X15, 64(DI)
+	MOVOU	X15, 80(DI)
+	MOVOU	X15, 96(DI)
+	MOVOU	X15, 112(DI)
+	MOVOU	X15, -128(DI)(BX*1)
+	MOVOU	X15, -112(DI)(BX*1)
+	MOVOU	X15, -96(DI)(BX*1)
+	MOVOU	X15, -80(DI)(BX*1)
+	MOVOU	X15, -64(DI)(BX*1)
+	MOVOU	X15, -48(DI)(BX*1)
+	MOVOU	X15, -32(DI)(BX*1)
+	MOVOU	X15, -16(DI)(BX*1)
 	RET
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index d91641a8e8..f1e3403596 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -31,11 +31,20 @@
 // See memmove Go doc for important implementation constraints.
 
 // func memmove(to, from unsafe.Pointer, n uintptr)
-TEXT runtime·memmove(SB), NOSPLIT, $0-24
-
+// ABIInternal for performance.
+TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = to
+	// BX = from
+	// CX = n
+	MOVQ	AX, DI
+	MOVQ	BX, SI
+	MOVQ	CX, BX
+#else
 	MOVQ	to+0(FP), DI
 	MOVQ	from+8(FP), SI
 	MOVQ	n+16(FP), BX
+#endif
 
 	// REP instructions have a high startup cost, so we handle small sizes
 	// with some straightline code. The REP MOVSQ instruction is really fast
@@ -244,6 +253,10 @@ move_129through256:
 	MOVOU	X13, -48(DI)(BX*1)
 	MOVOU	X14, -32(DI)(BX*1)
 	MOVOU	X15, -16(DI)(BX*1)
+#ifdef GOEXPERIMENT_regabig
+	// X15 must be zero on return
+	PXOR	X15, X15
+#endif
 	RET
 move_256through2048:
 	SUBQ	$256, BX
@@ -283,6 +296,10 @@ move_256through2048:
 	LEAQ	256(SI), SI
 	LEAQ	256(DI), DI
 	JGE	move_256through2048
+#ifdef GOEXPERIMENT_regabig
+	// X15 must be zero on return
+	PXOR	X15, X15
+#endif
 	JMP	tail
 
 avxUnaligned:
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index f635d942e4..16d7583202 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -109,6 +109,9 @@ func reflect_memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) {
 //go:noescape
 func memmove(to, from unsafe.Pointer, n uintptr)
 
+// Outside assembly calls memmove. Make sure it has ABI wrappers.
+//go:linkname memmove
+
 //go:linkname reflect_memmove reflect.memmove
 func reflect_memmove(to, from unsafe.Pointer, n uintptr) {
 	memmove(to, from, n)