From: Klaus Post Date: Wed, 6 Oct 2021 16:11:40 +0000 (+0000) Subject: crypto/md5: optimize amd64 assembly X-Git-Tag: go1.22rc1~1417 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=55d08e5010c3169e648a66433322530ca5054e9e;p=gostls13.git crypto/md5: optimize amd64 assembly * Use two ADDL instead of LEAL * Keep ones in R11 * Use XORL with lower latency instead of NOTL * Remove loads and load the correct value in the previous round * Reduce dependency chain in round 2. * Remove MOVL in round 3. name old time/op new time/op delta Hash8Bytes-32 104ns ± 0% 96ns ± 1% -7.83% (p=0.000 n=9+10) Hash64-32 169ns ± 0% 155ns ± 0% -7.97% (p=0.000 n=10+10) Hash128-32 244ns ± 0% 224ns ± 0% -8.16% (p=0.000 n=9+10) Hash256-32 396ns ± 0% 360ns ± 1% -9.01% (p=0.000 n=10+10) Hash512-32 700ns ± 1% 634ns ± 1% -9.43% (p=0.000 n=10+10) Hash1K-32 1.30µs ± 0% 1.18µs ± 1% -9.32% (p=0.000 n=9+10) Hash8K-32 9.77µs ± 0% 8.81µs ± 0% -9.78% (p=0.000 n=9+10) Hash1M-32 1.24ms ± 1% 1.12ms ± 1% -9.54% (p=0.000 n=10+10) Hash8M-32 10.0ms ± 1% 9.0ms ± 1% -10.04% (p=0.000 n=10+10) Hash8BytesUnaligned-32 104ns ± 0% 96ns ± 0% -7.50% (p=0.000 n=10+10) Hash1KUnaligned-32 1.32µs ± 1% 1.18µs ± 1% -10.42% (p=0.000 n=10+10) Hash8KUnaligned-32 9.80µs ± 0% 8.79µs ± 1% -10.29% (p=0.000 n=10+10) name old speed new speed delta Hash8Bytes-32 77.1MB/s ± 0% 83.6MB/s ± 1% +8.49% (p=0.000 n=9+10) Hash64-32 379MB/s ± 0% 412MB/s ± 0% +8.66% (p=0.000 n=10+10) Hash128-32 525MB/s ± 0% 572MB/s ± 0% +8.89% (p=0.000 n=9+10) Hash256-32 646MB/s ± 0% 710MB/s ± 1% +9.90% (p=0.000 n=10+10) Hash512-32 732MB/s ± 1% 808MB/s ± 1% +10.41% (p=0.000 n=10+10) Hash1K-32 786MB/s ± 0% 866MB/s ± 1% +10.30% (p=0.000 n=9+10) Hash8K-32 839MB/s ± 0% 930MB/s ± 0% +10.79% (p=0.000 n=10+10) Hash1M-32 849MB/s ± 1% 938MB/s ± 1% +10.54% (p=0.000 n=10+10) Hash8M-32 841MB/s ± 1% 935MB/s ± 1% +11.16% (p=0.000 n=10+10) Hash8BytesUnaligned-32 77.1MB/s ± 0% 83.4MB/s ± 0% +8.12% (p=0.000 n=10+10) Hash1KUnaligned-32 778MB/s ± 1% 869MB/s ± 1% +11.64% (p=0.000 n=10+10) Hash8KUnaligned-32 836MB/s ± 0% 932MB/s ± 1% +11.47% (p=0.000 n=10+10) Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993 This PR will be imported into Gerrit with the title and first comment (this text) used to generate the subject and body of the Gerrit change. Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993 GitHub-Last-Rev: ec8b15d789181d0dac57bf0ba5041ee7aeb305c9 GitHub-Pull-Request: golang/go#43690 Reviewed-on: https://go-review.googlesource.com/c/go/+/283538 Run-TryBot: Joel Sing Reviewed-by: Matthew Dempsky Reviewed-by: David Chase TryBot-Result: Gopher Robot Reviewed-by: Joel Sing --- diff --git a/src/crypto/md5/md5block_amd64.s b/src/crypto/md5/md5block_amd64.s index 7c7d92d7e8..75c8074b37 100644 --- a/src/crypto/md5/md5block_amd64.s +++ b/src/crypto/md5/md5block_amd64.s @@ -25,6 +25,7 @@ TEXT ·block(SB),NOSPLIT,$8-32 MOVL (1*4)(BP), BX MOVL (2*4)(BP), CX MOVL (3*4)(BP), DX + MOVL $0xffffffff, R11 CMPQ SI, DI JEQ end @@ -40,14 +41,15 @@ loop: #define ROUND1(a, b, c, d, index, const, shift) \ XORL c, R9; \ - LEAL const(a)(R8*1), a; \ + ADDL $const, a; \ + ADDL R8, a; \ ANDL b, R9; \ - XORL d, R9; \ - MOVL (index*4)(SI), R8; \ - ADDL R9, a; \ - ROLL $shift, a; \ - MOVL c, R9; \ - ADDL b, a + XORL d, R9; \ + MOVL (index*4)(SI), R8; \ + ADDL R9, a; \ + ROLL $shift, a; \ + MOVL c, R9; \ + ADDL b, a ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7); ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12); @@ -64,21 +66,23 @@ loop: ROUND1(AX,BX,CX,DX,13,0x6b901122, 7); ROUND1(DX,AX,BX,CX,14,0xfd987193,12); ROUND1(CX,DX,AX,BX,15,0xa679438e,17); - ROUND1(BX,CX,DX,AX, 0,0x49b40821,22); + ROUND1(BX,CX,DX,AX, 1,0x49b40821,22); - MOVL (1*4)(SI), R8 MOVL DX, R9 MOVL DX, R10 +// Uses https://github.com/animetosho/md5-optimisation#dependency-shortcut-in-g-function + #define ROUND2(a, b, c, d, index, const, shift) \ - NOTL R9; \ - LEAL const(a)(R8*1),a; \ + XORL R11, R9; \ + ADDL $const, a; \ + ADDL R8, a; \ ANDL b, R10; \ ANDL c, R9; \ MOVL (index*4)(SI),R8; \ - ORL R9, R10; \ + ADDL R9, a; \ + ADDL R10, a; \ MOVL c, R9; \ - ADDL R10, a; \ MOVL c, R10; \ ROLL $shift, a; \ ADDL b, a @@ -98,22 +102,34 @@ loop: ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5); ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9); ROUND2(CX,DX,AX,BX,12,0x676f02d9,14); - ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20); + ROUND2(BX,CX,DX,AX, 5,0x8d2a4c8a,20); - MOVL (5*4)(SI), R8 MOVL CX, R9 -#define ROUND3(a, b, c, d, index, const, shift) \ - LEAL const(a)(R8*1),a; \ +// Uses https://github.com/animetosho/md5-optimisation#h-function-re-use + +#define ROUND3FIRST(a, b, c, d, index, const, shift) \ + MOVL d, R9; \ + XORL c, R9; \ + XORL b, R9; \ + ADDL $const, a; \ + ADDL R8, a; \ MOVL (index*4)(SI),R8; \ - XORL d, R9; \ + ADDL R9, a; \ + ROLL $shift, a; \ + ADDL b, a + +#define ROUND3(a, b, c, d, index, const, shift) \ + XORL a, R9; \ XORL b, R9; \ + ADDL $const, a; \ + ADDL R8, a; \ + MOVL (index*4)(SI),R8; \ ADDL R9, a; \ ROLL $shift, a; \ - MOVL b, R9; \ ADDL b, a - ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4); + ROUND3FIRST(AX,BX,CX,DX, 8,0xfffa3942, 4); ROUND3(DX,AX,BX,CX,11,0x8771f681,11); ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16); ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23); @@ -130,13 +146,13 @@ loop: ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16); ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23); - MOVL (0*4)(SI), R8 - MOVL $0xffffffff, R9 + MOVL R11, R9 XORL DX, R9 #define ROUND4(a, b, c, d, index, const, shift) \ - LEAL const(a)(R8*1),a; \ - ORL b, R9; \ + ADDL $const, a; \ + ADDL R8, a; \ + ORL b, R9; \ XORL c, R9; \ ADDL R9, a; \ MOVL (index*4)(SI),R8; \