From: Balaram Makam Date: Wed, 13 Dec 2017 16:51:22 +0000 (-0500) Subject: runtime: improve arm64 memmove implementation X-Git-Tag: go1.11beta1~1385 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=213a75171df1c1fb0ae76fe7c8ff877b1fa9c0b9;p=gostls13.git runtime: improve arm64 memmove implementation Improve runtime memmove_arm64.s specializing for small copies and processing 32 bytes per iteration for 32 bytes or more. Benchmark results of runtime/Memmove on Amberwing: name old time/op new time/op delta Memmove/0 7.61ns ± 0% 7.20ns ± 0% ~ (p=0.053 n=5+7) Memmove/1 9.28ns ± 0% 8.80ns ± 0% -5.17% (p=0.000 n=4+8) Memmove/2 9.65ns ± 0% 9.20ns ± 0% -4.68% (p=0.000 n=5+8) Memmove/3 10.0ns ± 0% 9.2ns ± 0% -7.83% (p=0.000 n=5+8) Memmove/4 10.6ns ± 0% 9.2ns ± 0% -13.21% (p=0.000 n=5+8) Memmove/5 11.0ns ± 0% 9.2ns ± 0% -16.36% (p=0.000 n=5+8) Memmove/6 12.4ns ± 0% 9.2ns ± 0% -25.81% (p=0.000 n=5+8) Memmove/7 13.1ns ± 0% 9.2ns ± 0% -29.56% (p=0.000 n=5+8) Memmove/8 9.10ns ± 1% 9.20ns ± 0% +1.08% (p=0.002 n=5+8) Memmove/9 9.67ns ± 0% 9.20ns ± 0% -4.88% (p=0.000 n=5+8) Memmove/10 10.4ns ± 0% 9.2ns ± 0% -11.54% (p=0.000 n=5+8) Memmove/11 10.9ns ± 0% 9.2ns ± 0% -15.60% (p=0.000 n=5+8) Memmove/12 11.5ns ± 0% 9.2ns ± 0% -20.00% (p=0.000 n=5+8) Memmove/13 12.4ns ± 0% 9.2ns ± 0% -25.81% (p=0.000 n=5+8) Memmove/14 13.1ns ± 0% 9.2ns ± 0% -29.77% (p=0.000 n=5+8) Memmove/15 13.8ns ± 0% 9.2ns ± 0% -33.33% (p=0.000 n=5+8) Memmove/16 9.70ns ± 0% 9.20ns ± 0% -5.19% (p=0.000 n=5+8) Memmove/32 10.6ns ± 0% 9.2ns ± 0% -13.21% (p=0.000 n=4+8) Memmove/64 13.4ns ± 0% 10.2ns ± 0% -23.88% (p=0.000 n=4+8) Memmove/128 18.1ns ± 1% 13.2ns ± 0% -26.99% (p=0.000 n=5+8) Memmove/256 25.2ns ± 0% 16.4ns ± 0% -34.92% (p=0.000 n=5+8) Memmove/512 36.4ns ± 0% 22.8ns ± 0% -37.36% (p=0.000 n=5+8) Memmove/1024 70.1ns ± 0% 36.8ns ±11% -47.49% (p=0.002 n=5+8) Memmove/2048 121ns ± 0% 61ns ± 0% ~ (p=0.053 n=5+7) Memmove/4096 224ns ± 0% 120ns ± 0% -46.43% (p=0.000 n=5+8) MemmoveUnalignedDst/0 8.40ns ± 0% 8.00ns ± 0% -4.76% (p=0.000 n=5+8) MemmoveUnalignedDst/1 9.87ns ± 1% 10.00ns ± 0% ~ (p=0.070 n=5+8) MemmoveUnalignedDst/2 10.6ns ± 0% 10.4ns ± 0% -1.89% (p=0.000 n=5+8) MemmoveUnalignedDst/3 10.8ns ± 0% 10.4ns ± 0% -3.70% (p=0.000 n=5+8) MemmoveUnalignedDst/4 10.9ns ± 0% 10.3ns ± 0% ~ (p=0.053 n=5+7) MemmoveUnalignedDst/5 11.5ns ± 0% 10.3ns ± 1% -10.22% (p=0.000 n=4+8) MemmoveUnalignedDst/6 13.2ns ± 0% 10.4ns ± 1% -21.50% (p=0.000 n=5+8) MemmoveUnalignedDst/7 13.7ns ± 0% 10.3ns ± 1% -24.64% (p=0.000 n=4+8) MemmoveUnalignedDst/8 10.1ns ± 0% 10.4ns ± 0% +2.97% (p=0.002 n=5+8) MemmoveUnalignedDst/9 10.7ns ± 0% 10.4ns ± 0% -2.80% (p=0.000 n=5+8) MemmoveUnalignedDst/10 11.2ns ± 1% 10.4ns ± 0% -6.81% (p=0.000 n=5+8) MemmoveUnalignedDst/11 11.6ns ± 0% 10.4ns ± 0% -10.34% (p=0.000 n=5+8) MemmoveUnalignedDst/12 12.5ns ± 2% 10.4ns ± 0% -16.53% (p=0.000 n=5+8) MemmoveUnalignedDst/13 13.7ns ± 0% 10.4ns ± 0% -24.09% (p=0.000 n=5+8) MemmoveUnalignedDst/14 14.0ns ± 0% 10.4ns ± 0% -25.71% (p=0.000 n=5+8) MemmoveUnalignedDst/15 14.6ns ± 0% 10.4ns ± 0% -28.77% (p=0.000 n=5+8) MemmoveUnalignedDst/16 10.5ns ± 0% 10.4ns ± 0% -0.95% (p=0.000 n=5+8) MemmoveUnalignedDst/32 12.4ns ± 0% 11.6ns ± 0% -6.05% (p=0.000 n=5+8) MemmoveUnalignedDst/64 15.2ns ± 0% 12.3ns ± 0% -19.08% (p=0.000 n=5+8) MemmoveUnalignedDst/128 18.7ns ± 0% 15.2ns ± 0% -18.72% (p=0.000 n=5+8) MemmoveUnalignedDst/256 25.1ns ± 0% 18.6ns ± 0% -25.90% (p=0.000 n=5+8) MemmoveUnalignedDst/512 37.8ns ± 0% 24.4ns ± 0% -35.45% (p=0.000 n=5+8) MemmoveUnalignedDst/1024 74.6ns ± 0% 40.4ns ± 0% ~ (p=0.053 n=5+7) MemmoveUnalignedDst/2048 133ns ± 0% 75ns ± 0% -43.91% (p=0.000 n=5+8) MemmoveUnalignedDst/4096 247ns ± 0% 141ns ± 0% -42.91% (p=0.000 n=5+8) MemmoveUnalignedSrc/0 8.40ns ± 0% 8.00ns ± 0% -4.76% (p=0.000 n=5+8) MemmoveUnalignedSrc/1 9.81ns ± 0% 10.00ns ± 0% +1.98% (p=0.002 n=5+8) MemmoveUnalignedSrc/2 10.5ns ± 0% 10.0ns ± 0% -4.76% (p=0.000 n=5+8) MemmoveUnalignedSrc/3 10.7ns ± 1% 10.0ns ± 0% -6.89% (p=0.000 n=5+8) MemmoveUnalignedSrc/4 11.3ns ± 0% 10.0ns ± 0% -11.50% (p=0.000 n=5+8) MemmoveUnalignedSrc/5 11.6ns ± 0% 10.0ns ± 0% -13.79% (p=0.000 n=5+8) MemmoveUnalignedSrc/6 13.6ns ± 0% 10.0ns ± 0% -26.47% (p=0.000 n=5+8) MemmoveUnalignedSrc/7 14.4ns ± 0% 10.0ns ± 0% -30.75% (p=0.000 n=5+8) MemmoveUnalignedSrc/8 9.87ns ± 1% 10.00ns ± 0% ~ (p=0.070 n=5+8) MemmoveUnalignedSrc/9 10.4ns ± 0% 10.0ns ± 0% -3.85% (p=0.000 n=5+8) MemmoveUnalignedSrc/10 11.2ns ± 0% 10.0ns ± 0% -10.71% (p=0.000 n=5+8) MemmoveUnalignedSrc/11 11.8ns ± 0% 10.0ns ± 0% -15.25% (p=0.000 n=5+8) MemmoveUnalignedSrc/12 12.1ns ± 0% 10.0ns ± 0% -17.36% (p=0.000 n=5+8) MemmoveUnalignedSrc/13 13.6ns ± 0% 10.0ns ± 0% -26.47% (p=0.000 n=5+8) MemmoveUnalignedSrc/14 14.7ns ± 0% 10.0ns ± 0% -31.79% (p=0.000 n=5+8) MemmoveUnalignedSrc/15 14.4ns ± 0% 10.0ns ± 0% -30.56% (p=0.000 n=5+8) MemmoveUnalignedSrc/16 11.0ns ± 0% 10.0ns ± 0% -9.09% (p=0.000 n=5+8) MemmoveUnalignedSrc/32 11.5ns ± 0% 10.0ns ± 0% -13.04% (p=0.000 n=5+8) MemmoveUnalignedSrc/64 14.9ns ± 0% 11.2ns ± 0% -24.83% (p=0.000 n=4+8) MemmoveUnalignedSrc/128 19.5ns ± 0% 15.2ns ± 0% -22.05% (p=0.000 n=5+8) MemmoveUnalignedSrc/256 27.3ns ± 2% 19.2ns ± 0% -29.62% (p=0.000 n=5+8) MemmoveUnalignedSrc/512 40.4ns ± 0% 27.2ns ± 0% -32.67% (p=0.000 n=5+8) MemmoveUnalignedSrc/1024 75.4ns ± 0% 44.4ns ± 0% -41.15% (p=0.000 n=5+8) MemmoveUnalignedSrc/2048 131ns ± 0% 77ns ± 3% -41.56% (p=0.002 n=5+8) MemmoveUnalignedSrc/4096 248ns ± 0% 145ns ± 0% -41.53% (p=0.000 n=5+8) name old speed new speed delta Memmove/1 108MB/s ± 0% 114MB/s ± 0% +5.37% (p=0.004 n=4+8) Memmove/2 207MB/s ± 0% 217MB/s ± 0% +4.85% (p=0.002 n=5+8) Memmove/3 301MB/s ± 0% 326MB/s ± 0% +8.45% (p=0.002 n=5+8) Memmove/4 377MB/s ± 0% 435MB/s ± 0% +15.31% (p=0.004 n=4+8) Memmove/5 455MB/s ± 0% 543MB/s ± 0% +19.46% (p=0.002 n=5+8) Memmove/6 483MB/s ± 0% 652MB/s ± 0% +34.88% (p=0.003 n=5+7) Memmove/7 537MB/s ± 0% 761MB/s ± 0% +41.71% (p=0.002 n=5+8) Memmove/8 879MB/s ± 1% 869MB/s ± 0% -1.15% (p=0.000 n=5+7) Memmove/9 931MB/s ± 0% 978MB/s ± 0% +5.05% (p=0.002 n=5+8) Memmove/10 960MB/s ± 0% 1086MB/s ± 0% +13.13% (p=0.002 n=5+8) Memmove/11 1.00GB/s ± 0% 1.20GB/s ± 0% +18.92% (p=0.003 n=5+7) Memmove/12 1.04GB/s ± 0% 1.30GB/s ± 0% +25.40% (p=0.002 n=5+8) Memmove/13 1.05GB/s ± 0% 1.41GB/s ± 0% +34.87% (p=0.002 n=5+8) Memmove/14 1.07GB/s ± 0% 1.52GB/s ± 0% +42.14% (p=0.002 n=5+8) Memmove/15 1.09GB/s ± 0% 1.63GB/s ± 0% +49.91% (p=0.002 n=5+8) Memmove/16 1.65GB/s ± 0% 1.74GB/s ± 0% +5.40% (p=0.003 n=5+7) Memmove/32 3.01GB/s ± 0% 3.48GB/s ± 0% +15.58% (p=0.003 n=5+7) Memmove/64 4.76GB/s ± 0% 6.27GB/s ± 0% +31.75% (p=0.003 n=5+7) Memmove/128 7.08GB/s ± 1% 9.69GB/s ± 0% +36.96% (p=0.002 n=5+8) Memmove/256 10.2GB/s ± 0% 15.6GB/s ± 0% +53.58% (p=0.002 n=5+8) Memmove/512 14.1GB/s ± 0% 22.4GB/s ± 0% +59.57% (p=0.003 n=5+7) Memmove/1024 14.6GB/s ± 0% 27.9GB/s ±10% +91.00% (p=0.002 n=5+8) Memmove/2048 16.9GB/s ± 0% 33.4GB/s ± 0% +98.32% (p=0.003 n=5+7) Memmove/4096 18.3GB/s ± 0% 33.9GB/s ± 0% +85.80% (p=0.002 n=5+8) MemmoveUnalignedDst/1 101MB/s ± 1% 100MB/s ± 0% ~ (p=0.586 n=5+8) MemmoveUnalignedDst/2 189MB/s ± 0% 192MB/s ± 0% +1.82% (p=0.002 n=5+8) MemmoveUnalignedDst/3 278MB/s ± 0% 288MB/s ± 0% +3.88% (p=0.003 n=5+7) MemmoveUnalignedDst/4 368MB/s ± 0% 387MB/s ± 0% +5.41% (p=0.003 n=5+7) MemmoveUnalignedDst/5 434MB/s ± 0% 484MB/s ± 0% +11.52% (p=0.002 n=5+8) MemmoveUnalignedDst/6 454MB/s ± 0% 580MB/s ± 0% +27.62% (p=0.002 n=5+8) MemmoveUnalignedDst/7 509MB/s ± 0% 677MB/s ± 0% +33.01% (p=0.002 n=5+8) MemmoveUnalignedDst/8 792MB/s ± 0% 770MB/s ± 0% -2.77% (p=0.002 n=5+8) MemmoveUnalignedDst/9 841MB/s ± 0% 866MB/s ± 0% +2.92% (p=0.002 n=5+8) MemmoveUnalignedDst/10 896MB/s ± 0% 962MB/s ± 0% +7.35% (p=0.003 n=5+7) MemmoveUnalignedDst/11 947MB/s ± 0% 1058MB/s ± 0% +11.80% (p=0.002 n=5+8) MemmoveUnalignedDst/12 962MB/s ± 2% 1154MB/s ± 0% +19.97% (p=0.002 n=5+8) MemmoveUnalignedDst/13 947MB/s ± 0% 1251MB/s ± 0% +32.08% (p=0.002 n=5+8) MemmoveUnalignedDst/14 1.00GB/s ± 0% 1.35GB/s ± 0% +34.55% (p=0.002 n=5+8) MemmoveUnalignedDst/15 1.03GB/s ± 0% 1.44GB/s ± 0% +40.50% (p=0.002 n=5+8) MemmoveUnalignedDst/16 1.53GB/s ± 0% 1.54GB/s ± 0% +0.77% (p=0.002 n=5+8) MemmoveUnalignedDst/32 2.58GB/s ± 0% 2.75GB/s ± 0% +6.52% (p=0.003 n=5+7) MemmoveUnalignedDst/64 4.21GB/s ± 0% 5.19GB/s ± 0% +23.40% (p=0.004 n=5+6) MemmoveUnalignedDst/128 6.86GB/s ± 0% 8.42GB/s ± 0% +22.78% (p=0.003 n=5+7) MemmoveUnalignedDst/256 10.2GB/s ± 0% 13.8GB/s ± 0% +35.15% (p=0.002 n=5+8) MemmoveUnalignedDst/512 13.5GB/s ± 0% 21.0GB/s ± 0% +54.90% (p=0.002 n=5+8) MemmoveUnalignedDst/1024 13.7GB/s ± 0% 25.3GB/s ± 0% +84.61% (p=0.003 n=5+7) MemmoveUnalignedDst/2048 15.3GB/s ± 0% 27.5GB/s ± 0% +79.52% (p=0.002 n=5+8) MemmoveUnalignedDst/4096 16.5GB/s ± 0% 28.9GB/s ± 0% +74.74% (p=0.002 n=5+8) MemmoveUnalignedSrc/1 102MB/s ± 0% 100MB/s ± 0% -2.02% (p=0.000 n=5+7) MemmoveUnalignedSrc/2 191MB/s ± 0% 200MB/s ± 0% +4.78% (p=0.002 n=5+8) MemmoveUnalignedSrc/3 279MB/s ± 0% 300MB/s ± 0% +7.45% (p=0.002 n=5+8) MemmoveUnalignedSrc/4 354MB/s ± 0% 400MB/s ± 0% +13.10% (p=0.002 n=5+8) MemmoveUnalignedSrc/5 431MB/s ± 0% 500MB/s ± 0% +16.02% (p=0.002 n=5+8) MemmoveUnalignedSrc/6 441MB/s ± 0% 600MB/s ± 0% +36.03% (p=0.002 n=5+8) MemmoveUnalignedSrc/7 485MB/s ± 0% 700MB/s ± 0% +44.29% (p=0.002 n=5+8) MemmoveUnalignedSrc/8 811MB/s ± 1% 800MB/s ± 0% -1.36% (p=0.016 n=5+8) MemmoveUnalignedSrc/9 864MB/s ± 0% 900MB/s ± 0% +4.07% (p=0.002 n=5+8) MemmoveUnalignedSrc/10 893MB/s ± 0% 999MB/s ± 0% +11.97% (p=0.002 n=5+8) MemmoveUnalignedSrc/11 932MB/s ± 0% 1099MB/s ± 0% +18.01% (p=0.002 n=5+8) MemmoveUnalignedSrc/12 988MB/s ± 0% 1199MB/s ± 0% +21.35% (p=0.002 n=5+8) MemmoveUnalignedSrc/13 955MB/s ± 0% 1299MB/s ± 0% +36.02% (p=0.002 n=5+8) MemmoveUnalignedSrc/14 955MB/s ± 0% 1399MB/s ± 0% +46.52% (p=0.002 n=5+8) MemmoveUnalignedSrc/15 1.04GB/s ± 0% 1.50GB/s ± 0% +44.18% (p=0.002 n=5+8) MemmoveUnalignedSrc/16 1.45GB/s ± 0% 1.60GB/s ± 0% +10.14% (p=0.002 n=5+8) MemmoveUnalignedSrc/32 2.78GB/s ± 0% 3.20GB/s ± 0% +15.16% (p=0.003 n=5+7) MemmoveUnalignedSrc/64 4.30GB/s ± 0% 5.72GB/s ± 0% +32.90% (p=0.003 n=5+7) MemmoveUnalignedSrc/128 6.57GB/s ± 0% 8.42GB/s ± 0% +28.06% (p=0.002 n=5+8) MemmoveUnalignedSrc/256 9.39GB/s ± 1% 13.33GB/s ± 0% +41.96% (p=0.002 n=5+8) MemmoveUnalignedSrc/512 12.7GB/s ± 0% 18.8GB/s ± 0% +48.53% (p=0.003 n=5+7) MemmoveUnalignedSrc/1024 13.6GB/s ± 0% 23.0GB/s ± 0% +69.82% (p=0.002 n=5+8) MemmoveUnalignedSrc/2048 15.6GB/s ± 0% 26.8GB/s ± 3% +71.37% (p=0.002 n=5+8) MemmoveUnalignedSrc/4096 16.5GB/s ± 0% 28.2GB/s ± 0% +71.40% (p=0.002 n=5+8) Fixes #22925 Change-Id: I38c1a9ad5c6e3f4f95fc521c4b7e3140b58b4737 Reviewed-on: https://go-review.googlesource.com/83799 Run-TryBot: Cherry Zhang Reviewed-by: Cherry Zhang --- diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s index 42bdffc269..2faad8df93 100644 --- a/src/runtime/memmove_arm64.s +++ b/src/runtime/memmove_arm64.s @@ -9,16 +9,15 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24 MOVD to+0(FP), R3 MOVD from+8(FP), R4 MOVD n+16(FP), R5 - CMP $0, R5 - BNE check + CBNZ R5, check RET check: - AND $~7, R5, R7 // R7 is N&~7 - // TODO(mwhudson): this is written this way to avoid tickling - // warnings from addpool when written as AND $7, R5, R6 (see - // https://golang.org/issue/12708) - SUB R7, R5, R6 // R6 is N&7 + CMP $16, R5 + BLE copy16 + + AND $~31, R5, R7 // R7 is N&~31 + SUB R7, R5, R6 // R6 is N&31 CMP R3, R4 BLT backward @@ -31,20 +30,20 @@ check: // optimization, but the on the one tested so far (xgene) it did not // make a significance difference.) - CMP $0, R7 // Do we need to do any word-by-word copying? - BEQ noforwardlarge + CBZ R7, noforwardlarge // Do we need to do any doubleword-by-doubleword copying? ADD R3, R7, R9 // R9 points just past where we copy by word forwardlargeloop: - MOVD.P 8(R4), R8 // R8 is just a scratch register - MOVD.P R8, 8(R3) - CMP R3, R9 - BNE forwardlargeloop + LDP.P 32(R4), (R8, R10) + STP.P (R8, R10), 32(R3) + LDP -16(R4), (R11, R12) + STP (R11, R12), -16(R3) + SUB $32, R7, R7 + CBNZ R7, forwardlargeloop noforwardlarge: - CMP $0, R6 // Do we need to do any byte-by-byte copying? - BNE forwardtail + CBNZ R6, forwardtail // Do we need to do any byte-by-byte copying? RET forwardtail: @@ -57,6 +56,39 @@ forwardtailloop: BNE forwardtailloop RET + // Small copies: 1..16 bytes. +copy16: + ADD R4, R5, R8 // R8 points just past the last source byte + ADD R3, R5, R9 // R9 points just past the last destination byte + CMP $8, R5 + BLT copy7 + MOVD (R4), R6 + MOVD -8(R8), R7 + MOVD R6, (R3) + MOVD R7, -8(R9) + RET + +copy7: + TBZ $2, R5, copy3 + MOVWU (R4), R6 + MOVWU -4(R8), R7 + MOVW R6, (R3) + MOVW R7, -4(R9) + RET + +copy3: + TBZ $1, R5, copy1 + MOVHU (R4), R6 + MOVHU -2(R8), R7 + MOVH R6, (R3) + MOVH R7, -2(R9) + RET + +copy1: + MOVBU (R4), R6 + MOVB R6, (R3) + RET + backward: // Copying backwards proceeds by copying R6 bytes then copying R7/8 words. // R3 and R4 are advanced to the end of the destination/source buffers @@ -65,8 +97,7 @@ backward: ADD R4, R5, R4 // R4 points just past the last source byte ADD R3, R5, R3 // R3 points just past the last destination byte - CMP $0, R6 // Do we need to do any byte-by-byte copying? - BEQ nobackwardtail + CBZ R6, nobackwardtail // Do we need to do any byte-by-byte copying? SUB R6, R3, R9 // R9 points at the lowest destination byte that should be copied by byte. backwardtailloop: @@ -76,16 +107,17 @@ backwardtailloop: BNE backwardtailloop nobackwardtail: - CMP $0, R7 // Do we need to do any word-by-word copying? - BNE backwardlarge + CBNZ R7, backwardlarge // Do we need to do any doubleword-by-doubleword copying? RET backwardlarge: SUB R7, R3, R9 // R9 points at the lowest destination byte backwardlargeloop: - MOVD.W -8(R4), R8 - MOVD.W R8, -8(R3) + LDP -16(R4), (R8, R10) + STP (R8, R10), -16(R3) + LDP.W -32(R4), (R11, R12) + STP.W (R11, R12), -32(R3) CMP R9, R3 BNE backwardlargeloop RET