From: Austin Clements Date: Thu, 8 Apr 2021 21:43:51 +0000 (-0400) Subject: runtime: port performance-critical functions to regabi X-Git-Tag: go1.17beta1~711 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=849dba07a5;p=gostls13.git runtime: port performance-critical functions to regabi This CL ports a few performance-critical runtime assembly functions to use register arguments directly. While using the faster ABI is nice, the real win here is that we avoid ABI wrappers: since these are "builtin" functions in the compiler, it can generate calls to them without knowing that their native implementation is ABI0. Hence, it generates ABIInternal calls that go through ABI wrappers. By porting them to use ABIInternal natively, we avoid the overhead of the ABI wrapper. This significantly improves performance on several benchmarks, comparing regabiwrappers before and after this change: name old time/op new time/op delta BiogoIgor 15.7s ± 2% 15.7s ± 2% ~ (p=0.617 n=25+25) BiogoKrishna 18.5s ± 5% 17.7s ± 2% -4.61% (p=0.000 n=25+25) BleveIndexBatch100 5.91s ± 3% 5.82s ± 3% -1.60% (p=0.000 n=25+25) BleveQuery 6.76s ± 0% 6.60s ± 1% -2.31% (p=0.000 n=22+25) CompileTemplate 248ms ± 5% 245ms ± 1% ~ (p=0.643 n=25+20) CompileUnicode 94.4ms ± 3% 93.9ms ± 2% ~ (p=0.152 n=24+23) CompileGoTypes 1.60s ± 2% 1.59s ± 2% ~ (p=0.059 n=24+24) CompileCompiler 104ms ± 3% 103ms ± 1% ~ (p=0.056 n=25+22) CompileSSA 10.9s ± 1% 10.9s ± 1% ~ (p=0.052 n=25+25) CompileFlate 156ms ± 8% 152ms ± 1% -2.49% (p=0.008 n=25+21) CompileGoParser 248ms ± 1% 249ms ± 2% ~ (p=0.058 n=21+20) CompileReflect 595ms ± 3% 601ms ± 4% ~ (p=0.182 n=25+25) CompileTar 211ms ± 2% 211ms ± 1% ~ (p=0.663 n=23+23) CompileXML 282ms ± 2% 284ms ± 5% ~ (p=0.456 n=21+23) CompileStdCmd 13.6s ± 2% 13.5s ± 2% ~ (p=0.112 n=25+24) FoglemanFauxGLRenderRotateBoat 8.69s ± 2% 8.67s ± 0% ~ (p=0.094 n=22+25) FoglemanPathTraceRenderGopherIter1 20.2s ± 2% 20.7s ± 3% +2.53% (p=0.000 n=24+24) GopherLuaKNucleotide 31.4s ± 1% 31.0s ± 1% 
-1.28% (p=0.000 n=25+24) MarkdownRenderXHTML 246ms ± 1% 244ms ± 1% -0.79% (p=0.000 n=20+21) Tile38WithinCircle100kmRequest 843µs ± 4% 818µs ± 4% -2.93% (p=0.000 n=25+25) Tile38IntersectsCircle100kmRequest 1.06ms ± 5% 1.05ms ± 3% -1.19% (p=0.021 n=24+25) Tile38KNearestLimit100Request 1.01ms ± 1% 1.01ms ± 2% ~ (p=0.335 n=22+25) [Geo mean] 596ms 592ms -0.71% (https://perf.golang.org/search?q=upload:20210411.5) It also significantly reduces the performance penalty of enabling regabiwrappers, though it doesn't yet fully close the gap on all benchmarks: name old time/op new time/op delta BiogoIgor 15.7s ± 1% 15.7s ± 2% ~ (p=0.366 n=24+25) BiogoKrishna 17.7s ± 2% 17.7s ± 2% ~ (p=0.315 n=23+25) BleveIndexBatch100 5.86s ± 4% 5.82s ± 3% ~ (p=0.137 n=24+25) BleveQuery 6.55s ± 0% 6.60s ± 1% +0.83% (p=0.000 n=24+25) CompileTemplate 244ms ± 1% 245ms ± 1% ~ (p=0.208 n=21+20) CompileUnicode 94.0ms ± 4% 93.9ms ± 2% ~ (p=0.666 n=24+23) CompileGoTypes 1.60s ± 2% 1.59s ± 2% ~ (p=0.154 n=25+24) CompileCompiler 103ms ± 1% 103ms ± 1% ~ (p=0.905 n=24+22) CompileSSA 10.9s ± 2% 10.9s ± 1% ~ (p=0.803 n=25+25) CompileFlate 153ms ± 1% 152ms ± 1% ~ (p=0.182 n=23+21) CompileGoParser 250ms ± 2% 249ms ± 2% ~ (p=0.843 n=24+20) CompileReflect 595ms ± 4% 601ms ± 4% ~ (p=0.141 n=25+25) CompileTar 212ms ± 3% 211ms ± 1% ~ (p=0.499 n=23+23) CompileXML 282ms ± 1% 284ms ± 5% ~ (p=0.129 n=20+23) CompileStdCmd 13.5s ± 2% 13.5s ± 2% ~ (p=0.480 n=24+24) FoglemanFauxGLRenderRotateBoat 8.66s ± 1% 8.67s ± 0% ~ (p=0.325 n=25+25) FoglemanPathTraceRenderGopherIter1 20.6s ± 3% 20.7s ± 3% ~ (p=0.137 n=25+24) GopherLuaKNucleotide 30.5s ± 2% 31.0s ± 1% +1.68% (p=0.000 n=23+24) MarkdownRenderXHTML 243ms ± 1% 244ms ± 1% +0.51% (p=0.000 n=23+21) Tile38WithinCircle100kmRequest 801µs ± 2% 818µs ± 4% +2.11% (p=0.000 n=25+25) Tile38IntersectsCircle100kmRequest 1.01ms ± 2% 1.05ms ± 3% +4.34% (p=0.000 n=24+25) Tile38KNearestLimit100Request 1.00ms ± 1% 1.01ms ± 2% +0.81% (p=0.008 n=21+25) [Geo mean] 589ms 592ms +0.50% 
(https://perf.golang.org/search?q=upload:20210411.6) Change-Id: I8f77f010b0abc658064df569a27a9c7a7b1c7bf9 Reviewed-on: https://go-review.googlesource.com/c/go/+/308931 Trust: Austin Clements Run-TryBot: Austin Clements Reviewed-by: Cherry Zhang TryBot-Result: Go Bot --- diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 77f4939b30..e883f20045 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1011,34 +1011,62 @@ done: // func memhash(p unsafe.Pointer, h, s uintptr) uintptr // hash function using AES hardware instructions -TEXT runtime·memhash(SB),NOSPLIT,$0-32 +TEXT runtime·memhash<ABIInternal>(SB),NOSPLIT,$0-32 +#ifdef GOEXPERIMENT_regabiargs + // AX = ptr to data + // BX = seed + // CX = size +#endif CMPB runtime·useAeshash(SB), $0 JEQ noaes +#ifndef GOEXPERIMENT_regabiargs MOVQ p+0(FP), AX // ptr to data MOVQ s+16(FP), CX // size LEAQ ret+24(FP), DX +#endif JMP aeshashbody<>(SB) noaes: - JMP runtime·memhashFallback(SB) + JMP runtime·memhashFallback<ABIInternal>(SB) // func strhash(p unsafe.Pointer, h uintptr) uintptr -TEXT runtime·strhash(SB),NOSPLIT,$0-24 +TEXT runtime·strhash<ABIInternal>(SB),NOSPLIT,$0-24 +#ifdef GOEXPERIMENT_regabiargs + // AX = ptr to string struct + // BX = seed +#endif CMPB runtime·useAeshash(SB), $0 JEQ noaes +#ifndef GOEXPERIMENT_regabiargs MOVQ p+0(FP), AX // ptr to string struct +#endif MOVQ 8(AX), CX // length of string MOVQ (AX), AX // string data +#ifndef GOEXPERIMENT_regabiargs LEAQ ret+16(FP), DX +#endif JMP aeshashbody<>(SB) noaes: - JMP runtime·strhashFallback(SB) + JMP runtime·strhashFallback<ABIInternal>(SB) // AX: data +#ifdef GOEXPERIMENT_regabiargs +// BX: hash seed +#else +// h+8(FP): hash seed +#endif // CX: length +#ifdef GOEXPERIMENT_regabiargs +// At return: AX = return value +#else // DX: address to put return value +#endif TEXT aeshashbody<>(SB),NOSPLIT,$0-0 // Fill an SSE register with our seeds. 
+#ifdef GOEXPERIMENT_regabiargs + MOVQ BX, X0 // 64 bits of per-table hash seed +#else MOVQ h+8(FP), X0 // 64 bits of per-table hash seed +#endif PINSRW $4, CX, X0 // 16 bits of length PSHUFHW $0, X0, X0 // repeat length 4 times total MOVO X0, X1 // save unscrambled seed @@ -1075,7 +1103,11 @@ final1: AESENC X1, X1 // scramble combo 3 times AESENC X1, X1 AESENC X1, X1 +#ifdef GOEXPERIMENT_regabiargs + MOVQ X1, AX // return X1 +#else MOVQ X1, (DX) +#endif RET endofpage: @@ -1091,7 +1123,11 @@ endofpage: aes0: // Return scrambled input seed AESENC X0, X0 +#ifdef GOEXPERIMENT_regabiargs + MOVQ X0, AX // return X0 +#else MOVQ X0, (DX) +#endif RET aes16: @@ -1121,7 +1157,11 @@ aes17to32: // combine results PXOR X3, X2 +#ifdef GOEXPERIMENT_regabiargs + MOVQ X2, AX // return X2 +#else MOVQ X2, (DX) +#endif RET aes33to64: @@ -1163,7 +1203,11 @@ aes33to64: PXOR X6, X4 PXOR X7, X5 PXOR X5, X4 +#ifdef GOEXPERIMENT_regabiargs + MOVQ X4, AX // return X4 +#else MOVQ X4, (DX) +#endif RET aes65to128: @@ -1245,7 +1289,15 @@ aes65to128: PXOR X10, X8 PXOR X11, X9 PXOR X9, X8 +#ifdef GOEXPERIMENT_regabig + // X15 must be zero on return + PXOR X15, X15 +#endif +#ifdef GOEXPERIMENT_regabiargs + MOVQ X8, AX // return X8 +#else MOVQ X8, (DX) +#endif RET aes129plus: @@ -1361,38 +1413,73 @@ aesloop: PXOR X10, X8 PXOR X11, X9 PXOR X9, X8 +#ifdef GOEXPERIMENT_regabig + // X15 must be zero on return + PXOR X15, X15 +#endif +#ifdef GOEXPERIMENT_regabiargs + MOVQ X8, AX // return X8 +#else MOVQ X8, (DX) +#endif RET // func memhash32(p unsafe.Pointer, h uintptr) uintptr -TEXT runtime·memhash32(SB),NOSPLIT,$0-24 +// ABIInternal for performance. 
+TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT,$0-24 +#ifdef GOEXPERIMENT_regabiargs + // AX = ptr to data + // BX = seed +#endif CMPB runtime·useAeshash(SB), $0 JEQ noaes +#ifdef GOEXPERIMENT_regabiargs + MOVQ BX, X0 // X0 = seed +#else MOVQ p+0(FP), AX // ptr to data MOVQ h+8(FP), X0 // seed +#endif PINSRD $2, (AX), X0 // data AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+16(SB), X0 AESENC runtime·aeskeysched+32(SB), X0 +#ifdef GOEXPERIMENT_regabiargs + MOVQ X0, AX // return X0 +#else MOVQ X0, ret+16(FP) +#endif RET noaes: - JMP runtime·memhash32Fallback(SB) + JMP runtime·memhash32Fallback<ABIInternal>(SB) // func memhash64(p unsafe.Pointer, h uintptr) uintptr -TEXT runtime·memhash64(SB),NOSPLIT,$0-24 +// ABIInternal for performance. +TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT,$0-24 +#ifdef GOEXPERIMENT_regabiargs + // AX = ptr to data + // BX = seed +#else +#endif CMPB runtime·useAeshash(SB), $0 JEQ noaes +#ifdef GOEXPERIMENT_regabiargs + MOVQ BX, X0 // X0 = seed +#else MOVQ p+0(FP), AX // ptr to data MOVQ h+8(FP), X0 // seed +#endif PINSRQ $1, (AX), X0 // data AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+16(SB), X0 AESENC runtime·aeskeysched+32(SB), X0 +#ifdef GOEXPERIMENT_regabiargs + MOVQ X0, AX // return X0 +#else MOVQ X0, ret+16(FP) +#endif RET noaes: - JMP runtime·memhash64Fallback(SB) + JMP runtime·memhash64Fallback<ABIInternal>(SB) // simple mask to get rid of data in the high part of the register. DATA masks<>+0x00(SB)/8, $0x0000000000000000 diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s index 37fe9745b1..b4bc9988ec 100644 --- a/src/runtime/memclr_amd64.s +++ b/src/runtime/memclr_amd64.s @@ -12,9 +12,16 @@ // See memclrNoHeapPointers Go doc for important implementation constraints. // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) -TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16 +// ABIInternal for performance. 
+TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16 +#ifdef GOEXPERIMENT_regabiargs + // AX = ptr + // BX = n + MOVQ AX, DI // DI = ptr +#else MOVQ ptr+0(FP), DI MOVQ n+8(FP), BX +#endif XORQ AX, AX // MOVOU seems always faster than REP STOSQ. @@ -31,7 +38,9 @@ tail: JE _8 CMPQ BX, $16 JBE _9through16 - PXOR X0, X0 +#ifndef GOEXPERIMENT_regabig + PXOR X15, X15 +#endif CMPQ BX, $32 JBE _17through32 CMPQ BX, $64 @@ -45,22 +54,22 @@ tail: // TODO: for really big clears, use MOVNTDQ, even without AVX2. loop: - MOVOU X0, 0(DI) - MOVOU X0, 16(DI) - MOVOU X0, 32(DI) - MOVOU X0, 48(DI) - MOVOU X0, 64(DI) - MOVOU X0, 80(DI) - MOVOU X0, 96(DI) - MOVOU X0, 112(DI) - MOVOU X0, 128(DI) - MOVOU X0, 144(DI) - MOVOU X0, 160(DI) - MOVOU X0, 176(DI) - MOVOU X0, 192(DI) - MOVOU X0, 208(DI) - MOVOU X0, 224(DI) - MOVOU X0, 240(DI) + MOVOU X15, 0(DI) + MOVOU X15, 16(DI) + MOVOU X15, 32(DI) + MOVOU X15, 48(DI) + MOVOU X15, 64(DI) + MOVOU X15, 80(DI) + MOVOU X15, 96(DI) + MOVOU X15, 112(DI) + MOVOU X15, 128(DI) + MOVOU X15, 144(DI) + MOVOU X15, 160(DI) + MOVOU X15, 176(DI) + MOVOU X15, 192(DI) + MOVOU X15, 208(DI) + MOVOU X15, 224(DI) + MOVOU X15, 240(DI) SUBQ $256, BX ADDQ $256, DI CMPQ BX, $256 @@ -141,40 +150,40 @@ _9through16: MOVQ AX, -8(DI)(BX*1) RET _17through32: - MOVOU X0, (DI) - MOVOU X0, -16(DI)(BX*1) + MOVOU X15, (DI) + MOVOU X15, -16(DI)(BX*1) RET _33through64: - MOVOU X0, (DI) - MOVOU X0, 16(DI) - MOVOU X0, -32(DI)(BX*1) - MOVOU X0, -16(DI)(BX*1) + MOVOU X15, (DI) + MOVOU X15, 16(DI) + MOVOU X15, -32(DI)(BX*1) + MOVOU X15, -16(DI)(BX*1) RET _65through128: - MOVOU X0, (DI) - MOVOU X0, 16(DI) - MOVOU X0, 32(DI) - MOVOU X0, 48(DI) - MOVOU X0, -64(DI)(BX*1) - MOVOU X0, -48(DI)(BX*1) - MOVOU X0, -32(DI)(BX*1) - MOVOU X0, -16(DI)(BX*1) + MOVOU X15, (DI) + MOVOU X15, 16(DI) + MOVOU X15, 32(DI) + MOVOU X15, 48(DI) + MOVOU X15, -64(DI)(BX*1) + MOVOU X15, -48(DI)(BX*1) + MOVOU X15, -32(DI)(BX*1) + MOVOU X15, -16(DI)(BX*1) RET _129through256: - MOVOU X0, (DI) - MOVOU X0, 16(DI) - 
MOVOU X0, 32(DI) - MOVOU X0, 48(DI) - MOVOU X0, 64(DI) - MOVOU X0, 80(DI) - MOVOU X0, 96(DI) - MOVOU X0, 112(DI) - MOVOU X0, -128(DI)(BX*1) - MOVOU X0, -112(DI)(BX*1) - MOVOU X0, -96(DI)(BX*1) - MOVOU X0, -80(DI)(BX*1) - MOVOU X0, -64(DI)(BX*1) - MOVOU X0, -48(DI)(BX*1) - MOVOU X0, -32(DI)(BX*1) - MOVOU X0, -16(DI)(BX*1) + MOVOU X15, (DI) + MOVOU X15, 16(DI) + MOVOU X15, 32(DI) + MOVOU X15, 48(DI) + MOVOU X15, 64(DI) + MOVOU X15, 80(DI) + MOVOU X15, 96(DI) + MOVOU X15, 112(DI) + MOVOU X15, -128(DI)(BX*1) + MOVOU X15, -112(DI)(BX*1) + MOVOU X15, -96(DI)(BX*1) + MOVOU X15, -80(DI)(BX*1) + MOVOU X15, -64(DI)(BX*1) + MOVOU X15, -48(DI)(BX*1) + MOVOU X15, -32(DI)(BX*1) + MOVOU X15, -16(DI)(BX*1) RET diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s index d91641a8e8..f1e3403596 100644 --- a/src/runtime/memmove_amd64.s +++ b/src/runtime/memmove_amd64.s @@ -31,11 +31,20 @@ // See memmove Go doc for important implementation constraints. // func memmove(to, from unsafe.Pointer, n uintptr) -TEXT runtime·memmove(SB), NOSPLIT, $0-24 - +// ABIInternal for performance. +TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24 +#ifdef GOEXPERIMENT_regabiargs + // AX = to + // BX = from + // CX = n + MOVQ AX, DI + MOVQ BX, SI + MOVQ CX, BX +#else MOVQ to+0(FP), DI MOVQ from+8(FP), SI MOVQ n+16(FP), BX +#endif // REP instructions have a high startup cost, so we handle small sizes // with some straightline code. 
The REP MOVSQ instruction is really fast @@ -244,6 +253,10 @@ move_129through256: MOVOU X13, -48(DI)(BX*1) MOVOU X14, -32(DI)(BX*1) MOVOU X15, -16(DI)(BX*1) +#ifdef GOEXPERIMENT_regabig + // X15 must be zero on return + PXOR X15, X15 +#endif RET move_256through2048: SUBQ $256, BX @@ -283,6 +296,10 @@ move_256through2048: LEAQ 256(SI), SI LEAQ 256(DI), DI JGE move_256through2048 +#ifdef GOEXPERIMENT_regabig + // X15 must be zero on return + PXOR X15, X15 +#endif JMP tail avxUnaligned: diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index f635d942e4..16d7583202 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -109,6 +109,9 @@ func reflect_memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) { //go:noescape func memmove(to, from unsafe.Pointer, n uintptr) +// Outside assembly calls memmove. Make sure it has ABI wrappers. +//go:linkname memmove + //go:linkname reflect_memmove reflect.memmove func reflect_memmove(to, from unsafe.Pointer, n uintptr) { memmove(to, from, n)