crypto/md5: improve ppc64x performance
author Paul E. Murphy <murp@ibm.com>
Wed, 10 Mar 2021 23:06:54 +0000 (17:06 -0600)
committer Lynn Boger <laboger@linux.vnet.ibm.com>
Mon, 15 Mar 2021 12:30:38 +0000 (12:30 +0000)
This is mostly cleanup and simplification.  It removes
many unneeded register moves, loads, and bit-twiddling
operations which were holdovers from porting this from
the amd64 version.

The updated code loads each message word once per block
instead of once per round. Similarly, the logical
operations now match the original MD5 specification.
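
For reference, each rewritten ROUND macro now computes one of the
four logical functions from RFC 1321 followed by the common
add-rotate-add step. A minimal Go sketch (helper names here are
illustrative, not part of this CL):

    func F(b, c, d uint32) uint32 { return (b & c) | (^b & d) } // ROUND1
    func G(b, c, d uint32) uint32 { return (b & d) | (c &^ d) } // ROUND2
    func H(b, c, d uint32) uint32 { return b ^ c ^ d }          // ROUND3
    func I(b, c, d uint32) uint32 { return c ^ (b | ^d) }       // ROUND4

    func rotl(x, s uint32) uint32 { return x<<s | x>>(32-s) }

    // One step: a = b + ((a + f(b,c,d) + x + const) <<< shift)
    func step(f func(b, c, d uint32) uint32, a, b, c, d, x, k, s uint32) uint32 {
        return b + rotl(a+f(b, c, d)+x+k, s)
    }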

Also add extra sizes to the benchmarks to give more
data points on how the implementation scales with input
size.
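
The before/after tables below are in benchstat format; a similar
comparison can be reproduced along these lines (the exact flags are
illustrative):

    go test -run='^$' -bench=Hash -count=10 crypto/md5 > old.txt
    # apply this change, then:
    go test -run='^$' -bench=Hash -count=10 crypto/md5 > new.txt
    benchstat old.txt new.txt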

All in all, this is roughly a 20% improvement on ppc64le
code running on POWER9 (POWER8 is similar, but around
16%):

name                 old time/op    new time/op    delta
Hash8Bytes              297ns ± 0%     255ns ± 0%  -14.14%
Hash64                  527ns ± 0%     444ns ± 0%  -15.76%
Hash128                 771ns ± 0%     645ns ± 0%  -16.35%
Hash256                1.26µs ± 0%    1.05µs ± 0%  -16.68%
Hash512                2.23µs ± 0%    1.85µs ± 0%  -16.82%
Hash1K                 4.16µs ± 0%    3.46µs ± 0%  -16.83%
Hash8K                 31.2µs ± 0%    26.0µs ± 0%  -16.74%
Hash1M                 3.58ms ± 0%    2.98ms ± 0%  -16.74%
Hash8M                 26.1ms ± 0%    21.7ms ± 0%  -16.81%
Hash8BytesUnaligned     297ns ± 0%     255ns ± 0%  -14.08%
Hash1KUnaligned        4.16µs ± 0%    3.46µs ± 0%  -16.79%
Hash8KUnaligned        31.2µs ± 0%    26.0µs ± 0%  -16.78%

name                 old speed      new speed      delta
Hash8Bytes           26.9MB/s ± 0%  31.4MB/s ± 0%  +16.45%
Hash64                122MB/s ± 0%   144MB/s ± 0%  +18.69%
Hash128               166MB/s ± 0%   199MB/s ± 0%  +19.54%
Hash256               203MB/s ± 0%   244MB/s ± 0%  +20.01%
Hash512               230MB/s ± 0%   276MB/s ± 0%  +20.18%
Hash1K                246MB/s ± 0%   296MB/s ± 0%  +20.26%
Hash8K                263MB/s ± 0%   315MB/s ± 0%  +20.11%
Hash1M                293MB/s ± 0%   352MB/s ± 0%  +20.10%
Hash8M                321MB/s ± 0%   386MB/s ± 0%  +20.21%
Hash8BytesUnaligned  26.9MB/s ± 0%  31.4MB/s ± 0%  +16.41%
Hash1KUnaligned       246MB/s ± 0%   296MB/s ± 0%  +20.19%
Hash8KUnaligned       263MB/s ± 0%   315MB/s ± 0%  +20.15%

Change-Id: I269bfa6878966bb4f6a64dc349100f5dc453ab7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/300613
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Emmanuel Odeke <emmanuel@orijtech.com>

src/crypto/md5/md5_test.go
src/crypto/md5/md5block_ppc64x.s

src/crypto/md5/md5_test.go
index c0ac0971c460dad410020ab7d4572b9ba841f0ca..acd456af2151782686a90f6acb20b4a736cde8a3 100644 (file)
@@ -212,7 +212,7 @@ func TestLargeHashes(t *testing.T) {
 }
 
 var bench = New()
-var buf = make([]byte, 8192+1)
+var buf = make([]byte, 1024*1024*8+1)
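+// 8 MiB plus one byte, so the largest benchmark fits and the
+// unaligned variants can shift the start of the buffer by one.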
 var sum = make([]byte, bench.Size())
 
 func benchmarkSize(b *testing.B, size int, unaligned bool) {
@@ -235,6 +235,22 @@ func BenchmarkHash8Bytes(b *testing.B) {
        benchmarkSize(b, 8, false)
 }
 
+func BenchmarkHash64(b *testing.B) {
+       benchmarkSize(b, 64, false)
+}
+
+func BenchmarkHash128(b *testing.B) {
+       benchmarkSize(b, 128, false)
+}
+
+func BenchmarkHash256(b *testing.B) {
+       benchmarkSize(b, 256, false)
+}
+
+func BenchmarkHash512(b *testing.B) {
+       benchmarkSize(b, 512, false)
+}
+
 func BenchmarkHash1K(b *testing.B) {
        benchmarkSize(b, 1024, false)
 }
@@ -243,6 +259,14 @@ func BenchmarkHash8K(b *testing.B) {
        benchmarkSize(b, 8192, false)
 }
 
+func BenchmarkHash1M(b *testing.B) {
+       benchmarkSize(b, 1024*1024, false)
+}
+
+func BenchmarkHash8M(b *testing.B) {
+       benchmarkSize(b, 8*1024*1024, false)
+}
+
 func BenchmarkHash8BytesUnaligned(b *testing.B) {
        benchmarkSize(b, 8, true)
 }
src/crypto/md5/md5block_ppc64x.s
index f309a1413d7c7ad9c080805960407f812c3820f7..e1f859e3376bddcb9dbc2d00a507f0ffb9c8ae34 100644 (file)
        MOVWBR  (idx)(ptr), dst
 #endif
 
-TEXT ·block(SB),NOSPLIT,$0-32
-       MOVD    dig+0(FP), R10
-       MOVD    p+8(FP), R6
-       MOVD    p_len+16(FP), R5
-       SLD     $6, R5
-       SRD     $6, R5
-       ADD     R6, R5, R7
-
-       MOVWZ   0(R10), R22
-       MOVWZ   4(R10), R3
-       MOVWZ   8(R10), R4
-       MOVWZ   12(R10), R5
-       CMP     R6, R7
-       BEQ     end
-
-loop:
-       MOVWZ   R22, R14
-       MOVWZ   R3, R15
-       MOVWZ   R4, R16
-       MOVWZ   R5, R17
-
-       ENDIAN_MOVE(0,R6,R8,R21)
-       MOVWZ   R5, R9
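+// M00-M15 hold the 16 message words of the current block, loaded
+// once per block rather than reloaded on every round.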
+#define M00 R18
+#define M01 R19
+#define M02 R20
+#define M03 R24
+#define M04 R25
+#define M05 R26
+#define M06 R27
+#define M07 R28
+#define M08 R29
+#define M09 R21
+#define M10 R11
+#define M11 R8
+#define M12 R7
+#define M13 R12
+#define M14 R23
+#define M15 R10
 
 #define ROUND1(a, b, c, d, index, const, shift) \
-       XOR     c, R9; \
-       ADD     $const, a; \
-       ADD     R8, a; \
-       AND     b, R9; \
-       XOR     d, R9; \
-       ENDIAN_MOVE(index*4,R6,R8,R21); \
+       ADD     $const, index, R9; \
        ADD     R9, a; \
-       RLWMI   $shift, a, $0xffffffff, a; \
-       MOVWZ   c, R9; \
-       ADD     b, a; \
-       MOVWZ   a, a
-
-       ROUND1(R22,R3,R4,R5, 1,0xd76aa478, 7);
-       ROUND1(R5,R22,R3,R4, 2,0xe8c7b756,12);
-       ROUND1(R4,R5,R22,R3, 3,0x242070db,17);
-       ROUND1(R3,R4,R5,R22, 4,0xc1bdceee,22);
-       ROUND1(R22,R3,R4,R5, 5,0xf57c0faf, 7);
-       ROUND1(R5,R22,R3,R4, 6,0x4787c62a,12);
-       ROUND1(R4,R5,R22,R3, 7,0xa8304613,17);
-       ROUND1(R3,R4,R5,R22, 8,0xfd469501,22);
-       ROUND1(R22,R3,R4,R5, 9,0x698098d8, 7);
-       ROUND1(R5,R22,R3,R4,10,0x8b44f7af,12);
-       ROUND1(R4,R5,R22,R3,11,0xffff5bb1,17);
-       ROUND1(R3,R4,R5,R22,12,0x895cd7be,22);
-       ROUND1(R22,R3,R4,R5,13,0x6b901122, 7);
-       ROUND1(R5,R22,R3,R4,14,0xfd987193,12);
-       ROUND1(R4,R5,R22,R3,15,0xa679438e,17);
-       ROUND1(R3,R4,R5,R22, 0,0x49b40821,22);
-
-       ENDIAN_MOVE(1*4,R6,R8,R21)
-       MOVWZ   R5, R9
-       MOVWZ   R5, R10
+       AND     b, c, R9; \
+       ANDN    b, d, R31; \
+       OR      R9, R31, R9; \
+       ADD     R9, a; \
+       ROTLW   $shift, a; \
+       ADD     b, a;
 
 #define ROUND2(a, b, c, d, index, const, shift) \
-       XOR     $0xffffffff, R9; \ // NOTW R9
-       ADD     $const, a; \
-       ADD     R8, a; \
-       AND     b, R10; \
-       AND     c, R9; \
-       ENDIAN_MOVE(index*4,R6,R8,R21); \
-       OR      R9, R10; \
-       MOVWZ   c, R9; \
-       ADD     R10, a; \
-       MOVWZ   c, R10; \
-       RLWMI   $shift, a, $0xffffffff, a; \
-       ADD     b, a; \
-       MOVWZ   a, a
-
-       ROUND2(R22,R3,R4,R5, 6,0xf61e2562, 5);
-       ROUND2(R5,R22,R3,R4,11,0xc040b340, 9);
-       ROUND2(R4,R5,R22,R3, 0,0x265e5a51,14);
-       ROUND2(R3,R4,R5,R22, 5,0xe9b6c7aa,20);
-       ROUND2(R22,R3,R4,R5,10,0xd62f105d, 5);
-       ROUND2(R5,R22,R3,R4,15, 0x2441453, 9);
-       ROUND2(R4,R5,R22,R3, 4,0xd8a1e681,14);
-       ROUND2(R3,R4,R5,R22, 9,0xe7d3fbc8,20);
-       ROUND2(R22,R3,R4,R5,14,0x21e1cde6, 5);
-       ROUND2(R5,R22,R3,R4, 3,0xc33707d6, 9);
-       ROUND2(R4,R5,R22,R3, 8,0xf4d50d87,14);
-       ROUND2(R3,R4,R5,R22,13,0x455a14ed,20);
-       ROUND2(R22,R3,R4,R5, 2,0xa9e3e905, 5);
-       ROUND2(R5,R22,R3,R4, 7,0xfcefa3f8, 9);
-       ROUND2(R4,R5,R22,R3,12,0x676f02d9,14);
-       ROUND2(R3,R4,R5,R22, 0,0x8d2a4c8a,20);
-
-       ENDIAN_MOVE(5*4,R6,R8,R21)
-       MOVWZ   R4, R9
+       ADD     $const, index, R9; \
+       ADD     R9, a; \
+       AND     b, d, R31; \
+       ANDN    d, c, R9; \
+       OR      R9, R31; \
+       ADD     R31, a; \
+       ROTLW   $shift, a; \
+       ADD     b, a;
 
 #define ROUND3(a, b, c, d, index, const, shift) \
-       ADD     $const, a; \
-       ADD     R8, a; \
-       ENDIAN_MOVE(index*4,R6,R8,R21); \
-       XOR     d, R9; \
-       XOR     b, R9; \
+       ADD     $const, index, R9; \
        ADD     R9, a; \
-       RLWMI   $shift, a, $0xffffffff, a; \
-       MOVWZ   b, R9; \
-       ADD     b, a; \
-       MOVWZ   a, a
-
-       ROUND3(R22,R3,R4,R5, 8,0xfffa3942, 4);
-       ROUND3(R5,R22,R3,R4,11,0x8771f681,11);
-       ROUND3(R4,R5,R22,R3,14,0x6d9d6122,16);
-       ROUND3(R3,R4,R5,R22, 1,0xfde5380c,23);
-       ROUND3(R22,R3,R4,R5, 4,0xa4beea44, 4);
-       ROUND3(R5,R22,R3,R4, 7,0x4bdecfa9,11);
-       ROUND3(R4,R5,R22,R3,10,0xf6bb4b60,16);
-       ROUND3(R3,R4,R5,R22,13,0xbebfbc70,23);
-       ROUND3(R22,R3,R4,R5, 0,0x289b7ec6, 4);
-       ROUND3(R5,R22,R3,R4, 3,0xeaa127fa,11);
-       ROUND3(R4,R5,R22,R3, 6,0xd4ef3085,16);
-       ROUND3(R3,R4,R5,R22, 9, 0x4881d05,23);
-       ROUND3(R22,R3,R4,R5,12,0xd9d4d039, 4);
-       ROUND3(R5,R22,R3,R4,15,0xe6db99e5,11);
-       ROUND3(R4,R5,R22,R3, 2,0x1fa27cf8,16);
-       ROUND3(R3,R4,R5,R22, 0,0xc4ac5665,23);
-
-       ENDIAN_MOVE(0,R6,R8,R21)
-       MOVWZ   $0xffffffff, R9
-       XOR     R5, R9
+       XOR     d, c, R31; \
+       XOR     b, R31; \
+       ADD     R31, a; \
+       ROTLW   $shift, a; \
+       ADD     b, a;
 
 #define ROUND4(a, b, c, d, index, const, shift) \
-       ADD     $const, a; \
-       ADD     R8, a; \
-       OR      b, R9; \
-       XOR     c, R9; \
+       ADD     $const, index, R9; \
        ADD     R9, a; \
-       ENDIAN_MOVE(index*4,R6,R8,R21); \
-       MOVWZ   $0xffffffff, R9; \
-       RLWMI   $shift, a, $0xffffffff, a; \
-       XOR     c, R9; \
-       ADD     b, a; \
-       MOVWZ   a, a
-
-       ROUND4(R22,R3,R4,R5, 7,0xf4292244, 6);
-       ROUND4(R5,R22,R3,R4,14,0x432aff97,10);
-       ROUND4(R4,R5,R22,R3, 5,0xab9423a7,15);
-       ROUND4(R3,R4,R5,R22,12,0xfc93a039,21);
-       ROUND4(R22,R3,R4,R5, 3,0x655b59c3, 6);
-       ROUND4(R5,R22,R3,R4,10,0x8f0ccc92,10);
-       ROUND4(R4,R5,R22,R3, 1,0xffeff47d,15);
-       ROUND4(R3,R4,R5,R22, 8,0x85845dd1,21);
-       ROUND4(R22,R3,R4,R5,15,0x6fa87e4f, 6);
-       ROUND4(R5,R22,R3,R4, 6,0xfe2ce6e0,10);
-       ROUND4(R4,R5,R22,R3,13,0xa3014314,15);
-       ROUND4(R3,R4,R5,R22, 4,0x4e0811a1,21);
-       ROUND4(R22,R3,R4,R5,11,0xf7537e82, 6);
-       ROUND4(R5,R22,R3,R4, 2,0xbd3af235,10);
-       ROUND4(R4,R5,R22,R3, 9,0x2ad7d2bb,15);
-       ROUND4(R3,R4,R5,R22, 0,0xeb86d391,21);
+       ORN     d, b, R31; \
+       XOR     c, R31; \
+       ADD     R31, a; \
+       ROTLW   $shift, a; \
+       ADD     b, a;
+
+
+TEXT ·block(SB),NOSPLIT,$0-32
+       MOVD    dig+0(FP), R10
+       MOVD    p+8(FP), R6
+       MOVD    p_len+16(FP), R5
+
+       // We assume p_len >= 64
+       SRD     $6, R5
+       MOVD    R5, CTR
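+       // CTR = number of 64 byte blocks; the bdnz at the end of the
+       // loop decrements it and branches while it is nonzero.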
+
+       MOVWZ   0(R10), R22
+       MOVWZ   4(R10), R3
+       MOVWZ   8(R10), R4
+       MOVWZ   12(R10), R5
+
+loop:
+       MOVD    R22, R14
+       MOVD    R3, R15
+       MOVD    R4, R16
+       MOVD    R5, R17
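+       // Save the current state; it is added back in after the 64 rounds.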
+
+       ENDIAN_MOVE( 0,R6,M00,M15)
+       ENDIAN_MOVE( 4,R6,M01,M15)
+       ENDIAN_MOVE( 8,R6,M02,M15)
+       ENDIAN_MOVE(12,R6,M03,M15)
+
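+       // Round 1 uses F(b,c,d) = (b & c) | (~b & d).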
+       ROUND1(R22,R3,R4,R5,M00,0xd76aa478, 7);
+       ROUND1(R5,R22,R3,R4,M01,0xe8c7b756,12);
+       ROUND1(R4,R5,R22,R3,M02,0x242070db,17);
+       ROUND1(R3,R4,R5,R22,M03,0xc1bdceee,22);
+
+       ENDIAN_MOVE(16,R6,M04,M15)
+       ENDIAN_MOVE(20,R6,M05,M15)
+       ENDIAN_MOVE(24,R6,M06,M15)
+       ENDIAN_MOVE(28,R6,M07,M15)
+
+       ROUND1(R22,R3,R4,R5,M04,0xf57c0faf, 7);
+       ROUND1(R5,R22,R3,R4,M05,0x4787c62a,12);
+       ROUND1(R4,R5,R22,R3,M06,0xa8304613,17);
+       ROUND1(R3,R4,R5,R22,M07,0xfd469501,22);
+
+       ENDIAN_MOVE(32,R6,M08,M15)
+       ENDIAN_MOVE(36,R6,M09,M15)
+       ENDIAN_MOVE(40,R6,M10,M15)
+       ENDIAN_MOVE(44,R6,M11,M15)
+
+       ROUND1(R22,R3,R4,R5,M08,0x698098d8, 7);
+       ROUND1(R5,R22,R3,R4,M09,0x8b44f7af,12);
+       ROUND1(R4,R5,R22,R3,M10,0xffff5bb1,17);
+       ROUND1(R3,R4,R5,R22,M11,0x895cd7be,22);
+
+       ENDIAN_MOVE(48,R6,M12,M15)
+       ENDIAN_MOVE(52,R6,M13,M15)
+       ENDIAN_MOVE(56,R6,M14,M15)
+       ENDIAN_MOVE(60,R6,M15,M15)
+
+       ROUND1(R22,R3,R4,R5,M12,0x6b901122, 7);
+       ROUND1(R5,R22,R3,R4,M13,0xfd987193,12);
+       ROUND1(R4,R5,R22,R3,M14,0xa679438e,17);
+       ROUND1(R3,R4,R5,R22,M15,0x49b40821,22);
+
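+       // Round 2 uses G(b,c,d) = (b & d) | (c & ~d).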
+       ROUND2(R22,R3,R4,R5,M01,0xf61e2562, 5);
+       ROUND2(R5,R22,R3,R4,M06,0xc040b340, 9);
+       ROUND2(R4,R5,R22,R3,M11,0x265e5a51,14);
+       ROUND2(R3,R4,R5,R22,M00,0xe9b6c7aa,20);
+       ROUND2(R22,R3,R4,R5,M05,0xd62f105d, 5);
+       ROUND2(R5,R22,R3,R4,M10, 0x2441453, 9);
+       ROUND2(R4,R5,R22,R3,M15,0xd8a1e681,14);
+       ROUND2(R3,R4,R5,R22,M04,0xe7d3fbc8,20);
+       ROUND2(R22,R3,R4,R5,M09,0x21e1cde6, 5);
+       ROUND2(R5,R22,R3,R4,M14,0xc33707d6, 9);
+       ROUND2(R4,R5,R22,R3,M03,0xf4d50d87,14);
+       ROUND2(R3,R4,R5,R22,M08,0x455a14ed,20);
+       ROUND2(R22,R3,R4,R5,M13,0xa9e3e905, 5);
+       ROUND2(R5,R22,R3,R4,M02,0xfcefa3f8, 9);
+       ROUND2(R4,R5,R22,R3,M07,0x676f02d9,14);
+       ROUND2(R3,R4,R5,R22,M12,0x8d2a4c8a,20);
+
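+       // Round 3 uses H(b,c,d) = b ^ c ^ d.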
+       ROUND3(R22,R3,R4,R5,M05,0xfffa3942, 4);
+       ROUND3(R5,R22,R3,R4,M08,0x8771f681,11);
+       ROUND3(R4,R5,R22,R3,M11,0x6d9d6122,16);
+       ROUND3(R3,R4,R5,R22,M14,0xfde5380c,23);
+       ROUND3(R22,R3,R4,R5,M01,0xa4beea44, 4);
+       ROUND3(R5,R22,R3,R4,M04,0x4bdecfa9,11);
+       ROUND3(R4,R5,R22,R3,M07,0xf6bb4b60,16);
+       ROUND3(R3,R4,R5,R22,M10,0xbebfbc70,23);
+       ROUND3(R22,R3,R4,R5,M13,0x289b7ec6, 4);
+       ROUND3(R5,R22,R3,R4,M00,0xeaa127fa,11);
+       ROUND3(R4,R5,R22,R3,M03,0xd4ef3085,16);
+       ROUND3(R3,R4,R5,R22,M06, 0x4881d05,23);
+       ROUND3(R22,R3,R4,R5,M09,0xd9d4d039, 4);
+       ROUND3(R5,R22,R3,R4,M12,0xe6db99e5,11);
+       ROUND3(R4,R5,R22,R3,M15,0x1fa27cf8,16);
+       ROUND3(R3,R4,R5,R22,M02,0xc4ac5665,23);
+
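+       // Round 4 uses I(b,c,d) = c ^ (b | ~d).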
+       ROUND4(R22,R3,R4,R5,M00,0xf4292244, 6);
+       ROUND4(R5,R22,R3,R4,M07,0x432aff97,10);
+       ROUND4(R4,R5,R22,R3,M14,0xab9423a7,15);
+       ROUND4(R3,R4,R5,R22,M05,0xfc93a039,21);
+       ROUND4(R22,R3,R4,R5,M12,0x655b59c3, 6);
+       ROUND4(R5,R22,R3,R4,M03,0x8f0ccc92,10);
+       ROUND4(R4,R5,R22,R3,M10,0xffeff47d,15);
+       ROUND4(R3,R4,R5,R22,M01,0x85845dd1,21);
+       ROUND4(R22,R3,R4,R5,M08,0x6fa87e4f, 6);
+       ROUND4(R5,R22,R3,R4,M15,0xfe2ce6e0,10);
+       ROUND4(R4,R5,R22,R3,M06,0xa3014314,15);
+       ROUND4(R3,R4,R5,R22,M13,0x4e0811a1,21);
+       ROUND4(R22,R3,R4,R5,M04,0xf7537e82, 6);
+       ROUND4(R5,R22,R3,R4,M11,0xbd3af235,10);
+       ROUND4(R4,R5,R22,R3,M02,0x2ad7d2bb,15);
+       ROUND4(R3,R4,R5,R22,M09,0xeb86d391,21);
 
        ADD     R14, R22
        ADD     R15, R3
        ADD     R16, R4
        ADD     R17, R5
        ADD     $64, R6
-       CMP     R6, R7
-       BLT     loop
+       BC      16, 0, loop // bdnz
 
 end:
        MOVD    dig+0(FP), R10
@@ -198,4 +208,5 @@ end:
        MOVWZ   R3, 4(R10)
        MOVWZ   R4, 8(R10)
        MOVWZ   R5, 12(R10)
+
        RET