]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/md5: optimize amd64 assembly
authorKlaus Post <klauspost@gmail.com>
Wed, 6 Oct 2021 16:11:40 +0000 (16:11 +0000)
committerJoel Sing <joel@sing.id.au>
Fri, 4 Aug 2023 16:02:36 +0000 (16:02 +0000)
* Use two ADDL instead of LEAL
* Keep ones in R11
* Use XORL with lower latency instead of NOTL
* Remove loads and load the correct value in the previous round
* Reduce dependency chain in round 2.
* Remove MOVL in round 3.

name                    old time/op    new time/op    delta
Hash8Bytes-32              104ns ± 0%      96ns ± 1%   -7.83%   (p=0.000 n=9+10)
Hash64-32                  169ns ± 0%     155ns ± 0%   -7.97%  (p=0.000 n=10+10)
Hash128-32                 244ns ± 0%     224ns ± 0%   -8.16%   (p=0.000 n=9+10)
Hash256-32                 396ns ± 0%     360ns ± 1%   -9.01%  (p=0.000 n=10+10)
Hash512-32                 700ns ± 1%     634ns ± 1%   -9.43%  (p=0.000 n=10+10)
Hash1K-32                 1.30µs ± 0%    1.18µs ± 1%   -9.32%   (p=0.000 n=9+10)
Hash8K-32                 9.77µs ± 0%    8.81µs ± 0%   -9.78%   (p=0.000 n=9+10)
Hash1M-32                 1.24ms ± 1%    1.12ms ± 1%   -9.54%  (p=0.000 n=10+10)
Hash8M-32                 10.0ms ± 1%     9.0ms ± 1%  -10.04%  (p=0.000 n=10+10)
Hash8BytesUnaligned-32     104ns ± 0%      96ns ± 0%   -7.50%  (p=0.000 n=10+10)
Hash1KUnaligned-32        1.32µs ± 1%    1.18µs ± 1%  -10.42%  (p=0.000 n=10+10)
Hash8KUnaligned-32        9.80µs ± 0%    8.79µs ± 1%  -10.29%  (p=0.000 n=10+10)

name                    old speed      new speed      delta
Hash8Bytes-32           77.1MB/s ± 0%  83.6MB/s ± 1%   +8.49%   (p=0.000 n=9+10)
Hash64-32                379MB/s ± 0%   412MB/s ± 0%   +8.66%  (p=0.000 n=10+10)
Hash128-32               525MB/s ± 0%   572MB/s ± 0%   +8.89%   (p=0.000 n=9+10)
Hash256-32               646MB/s ± 0%   710MB/s ± 1%   +9.90%  (p=0.000 n=10+10)
Hash512-32               732MB/s ± 1%   808MB/s ± 1%  +10.41%  (p=0.000 n=10+10)
Hash1K-32                786MB/s ± 0%   866MB/s ± 1%  +10.30%   (p=0.000 n=9+10)
Hash8K-32                839MB/s ± 0%   930MB/s ± 0%  +10.79%  (p=0.000 n=10+10)
Hash1M-32                849MB/s ± 1%   938MB/s ± 1%  +10.54%  (p=0.000 n=10+10)
Hash8M-32                841MB/s ± 1%   935MB/s ± 1%  +11.16%  (p=0.000 n=10+10)
Hash8BytesUnaligned-32  77.1MB/s ± 0%  83.4MB/s ± 0%   +8.12%  (p=0.000 n=10+10)
Hash1KUnaligned-32       778MB/s ± 1%   869MB/s ± 1%  +11.64%  (p=0.000 n=10+10)
Hash8KUnaligned-32       836MB/s ± 0%   932MB/s ± 1%  +11.47%  (p=0.000 n=10+10)

Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993

This PR will be imported into Gerrit with the title and first
comment (this text) used to generate the subject and body of
the Gerrit change.

Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993
GitHub-Last-Rev: ec8b15d789181d0dac57bf0ba5041ee7aeb305c9
GitHub-Pull-Request: golang/go#43690
Reviewed-on: https://go-review.googlesource.com/c/go/+/283538
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
Reviewed-by: David Chase <drchase@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
src/crypto/md5/md5block_amd64.s

index 7c7d92d7e806dd399a0fd47be32aee1d690ed36e..75c8074b37ebb4679339f1b9190bf4f12f26fb81 100644 (file)
@@ -25,6 +25,7 @@ TEXT  ·block(SB),NOSPLIT,$8-32
        MOVL    (1*4)(BP),      BX
        MOVL    (2*4)(BP),      CX
        MOVL    (3*4)(BP),      DX
+       MOVL    $0xffffffff,    R11
 
        CMPQ    SI,             DI
        JEQ     end
@@ -40,14 +41,15 @@ loop:
 
 #define ROUND1(a, b, c, d, index, const, shift) \
        XORL    c, R9; \
-       LEAL    const(a)(R8*1), a; \
+       ADDL    $const, a; \
+       ADDL    R8, a; \
        ANDL    b, R9; \
-       XORL d, R9; \
-       MOVL (index*4)(SI), R8; \
-       ADDL R9, a; \
-       ROLL $shift, a; \
-       MOVL c, R9; \
-       ADDL b, a
+       XORL    d, R9; \
+       MOVL    (index*4)(SI), R8; \
+       ADDL    R9, a; \
+       ROLL    $shift, a; \
+       MOVL    c, R9; \
+       ADDL    b, a
 
        ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
        ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
@@ -64,21 +66,23 @@ loop:
        ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
        ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
        ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
-       ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+       ROUND1(BX,CX,DX,AX, 1,0x49b40821,22);
 
-       MOVL    (1*4)(SI),      R8
        MOVL    DX,             R9
        MOVL    DX,             R10
 
+// Uses https://github.com/animetosho/md5-optimisation#dependency-shortcut-in-g-function
+
 #define ROUND2(a, b, c, d, index, const, shift) \
-       NOTL    R9; \
-       LEAL    const(a)(R8*1),a; \
+       XORL    R11, R9; \
+       ADDL    $const, a; \
+       ADDL    R8,     a; \
        ANDL    b,              R10; \
        ANDL    c,              R9; \
        MOVL    (index*4)(SI),R8; \
-       ORL     R9,             R10; \
+       ADDL    R9,     a; \
+       ADDL    R10,    a; \
        MOVL    c,              R9; \
-       ADDL    R10,            a; \
        MOVL    c,              R10; \
        ROLL    $shift, a; \
        ADDL    b,              a
@@ -98,22 +102,34 @@ loop:
        ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
        ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
        ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
-       ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+       ROUND2(BX,CX,DX,AX, 5,0x8d2a4c8a,20);
 
-       MOVL    (5*4)(SI),      R8
        MOVL    CX,             R9
 
-#define ROUND3(a, b, c, d, index, const, shift) \
-       LEAL    const(a)(R8*1),a; \
+// Uses https://github.com/animetosho/md5-optimisation#h-function-re-use
+
+#define ROUND3FIRST(a, b, c, d, index, const, shift) \
+       MOVL    d,              R9; \
+       XORL    c,              R9; \
+       XORL    b,              R9; \
+       ADDL    $const, a; \
+       ADDL    R8,             a; \
        MOVL    (index*4)(SI),R8; \
-       XORL    d,              R9; \
+       ADDL    R9,             a; \
+       ROLL    $shift,         a; \
+       ADDL    b,              a
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+       XORL    a,              R9; \
        XORL    b,              R9; \
+       ADDL    $const, a; \
+       ADDL    R8,             a; \
+       MOVL    (index*4)(SI),R8; \
        ADDL    R9,             a; \
        ROLL    $shift,         a; \
-       MOVL    b,              R9; \
        ADDL    b,              a
 
-       ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+       ROUND3FIRST(AX,BX,CX,DX, 8,0xfffa3942, 4);
        ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
        ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
        ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
@@ -130,13 +146,13 @@ loop:
        ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
        ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
 
-       MOVL    (0*4)(SI),      R8
-       MOVL    $0xffffffff,    R9
+       MOVL    R11,    R9
        XORL    DX,             R9
 
 #define ROUND4(a, b, c, d, index, const, shift) \
-       LEAL    const(a)(R8*1),a; \
-       ORL     b,              R9; \
+       ADDL    $const, a; \
+       ADDL    R8,             a; \
+       ORL             b,              R9; \
        XORL    c,              R9; \
        ADDL    R9,             a; \
        MOVL    (index*4)(SI),R8; \