]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/sha512: improve performance for sha512.block on ppc64le
authorMike Strosaker <strosake@us.ibm.com>
Sat, 29 Oct 2016 06:36:41 +0000 (02:36 -0400)
committerDavid Chase <drchase@google.com>
Mon, 31 Oct 2016 19:13:52 +0000 (19:13 +0000)
Adds an assembly implementation of sha512.block for ppc64le to improve its
performance.  This implementation is largely based on the original amd64
implementation, unrolling the 80 iterations of the inner loop.

Fixes #17660

benchmark               old ns/op     new ns/op     delta
BenchmarkHash8Bytes     1715          1133          -33.94%
BenchmarkHash1K         10098         5513          -45.41%
BenchmarkHash8K         68004         35278         -48.12%

benchmark               old MB/s     new MB/s     speedup
BenchmarkHash8Bytes     4.66         7.06         1.52x
BenchmarkHash1K         101.40       185.72       1.83x
BenchmarkHash8K         120.46       232.21       1.93x

Change-Id: Ifd55a49a24cb159b3a09a8e928c3f37727aca103
Reviewed-on: https://go-review.googlesource.com/32320
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
Run-TryBot: David Chase <drchase@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/crypto/sha512/sha512block_decl.go
src/crypto/sha512/sha512block_generic.go
src/crypto/sha512/sha512block_ppc64le.s [new file with mode: 0644]

index 47d656a7e4a1aaaae78c358037babd770ab4074f..8194506bf6d3f6d144b042d8d413ef9f346d6b1e 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 s390x
+// +build amd64 s390x ppc64le
 
 package sha512
 
index 2c691baa3d92d5b6fddfb35a646250bf3be8eb83..08f2e07178994aaa7196e3e6a2f7b31f174ca7bb 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!s390x
+// +build !amd64,!s390x,!ppc64le
 
 package sha512
 
diff --git a/src/crypto/sha512/sha512block_ppc64le.s b/src/crypto/sha512/sha512block_ppc64le.s
new file mode 100644 (file)
index 0000000..7b338d8
--- /dev/null
@@ -0,0 +1,293 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// SHA512 block routine. See sha512block.go for Go equivalent.
+//
+// The algorithm is detailed in FIPS 180-4:
+//
+//  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
+//
+// Wt = Mt; for 0 <= t <= 15
+// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
+//
+// a = H0
+// b = H1
+// c = H2
+// d = H3
+// e = H4
+// f = H5
+// g = H6
+// h = H7
+//
+// for t = 0 to 79 {
+//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
+//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
+//    h = g
+//    g = f
+//    f = e
+//    e = d + T1
+//    d = c
+//    c = b
+//    b = a
+//    a = T1 + T2
+// }
+//
+// H0 = a + H0
+// H1 = b + H1
+// H2 = c + H2
+// H3 = d + H3
+// H4 = e + H4
+// H5 = f + H5
+// H6 = g + H6
+// H7 = h + H7
+
+// Wt = Mt; for 0 <= t <= 15
+#define MSGSCHEDULE0(index) \
+       MOVD    (index*8)(R6), R14; \
+       RLWNM   $24, R14, $-1, R21; \
+       RLWMI   $8, R14, $0x00FF0000, R21; \
+       RLWMI   $8, R14, $0x000000FF, R21; \
+       SLD     $32, R21; \
+       SRD     $32, R14, R20; \
+       RLWNM   $24, R20, $-1, R14; \
+       RLWMI   $8, R20, $0x00FF0000, R14; \
+       RLWMI   $8, R20, $0x000000FF, R14; \
+       OR      R21, R14; \
+       MOVD    R14, (index*8)(R9)
+
+// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
+//   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
+//   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
+#define MSGSCHEDULE1(index) \
+       MOVD    ((index-2)*8)(R9), R14; \
+       MOVD    R14, R16; \
+       RLDCL   $64-19, R14, $-1, R14; \
+       MOVD    R16, R17; \
+       RLDCL   $64-61, R16, $-1, R16; \
+       SRD     $6, R17; \
+       MOVD    ((index-15)*8)(R9), R15; \
+       XOR     R16, R14; \
+       MOVD    R15, R16; \
+       XOR     R17, R14; \
+       RLDCL   $64-1, R15, $-1, R15; \
+       MOVD    R16, R17; \
+       SRD     $7, R17; \
+       RLDCL   $64-8, R16, $-1, R16; \
+       MOVD    ((index-7)*8)(R9), R21; \
+       ADD     R21, R14; \
+       XOR     R16, R15; \
+       XOR     R17, R15; \
+       MOVD    ((index-16)*8)(R9), R21; \
+       ADD     R21, R15; \
+       ADD     R15, R14; \
+       MOVD    R14, ((index)*8)(R9)
+
+// Calculate T1 in R14 - uses R14, R16 and R17 registers.
+// h is also used as an accumulator. Wt is passed in R14.
+//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
+//     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
+//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
+#define SHA512T1(const, e, f, g, h) \
+       MOVD    $const, R17; \
+       ADD     R14, h; \
+       MOVD    e, R14; \
+       ADD     R17, h; \
+       MOVD    e, R16; \
+       RLDCL   $64-14, R14, $-1, R14; \
+       MOVD    e, R17; \
+       RLDCL   $64-18, R16, $-1, R16; \
+       XOR     R16, R14; \
+       MOVD    e, R16; \
+       RLDCL   $64-41, R17, $-1, R17; \
+       AND     f, R16; \
+       XOR     R14, R17; \
+       MOVD    e, R14; \
+       NOR     R14, R14, R14; \
+       ADD     R17, h; \
+       AND     g, R14; \
+       XOR     R16, R14; \
+       ADD     h, R14
+
+// Calculate T2 in R15 - uses R15, R16, R17 and R8 registers.
+//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
+//     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
+//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
+#define SHA512T2(a, b, c) \
+       MOVD    a, R8; \
+       MOVD    c, R15; \
+       RLDCL   $64-28, R8, $-1, R8; \
+       MOVD    a, R17; \
+       AND     b, R15; \
+       RLDCL   $64-34, R17, $-1, R17; \
+       MOVD    a, R16; \
+       AND     c, R16; \
+       XOR     R17, R8; \
+       XOR     R16, R15; \
+       MOVD    a, R17; \
+       MOVD    b, R16; \
+       RLDCL   $64-39, R17, $-1, R17; \
+       AND     a, R16; \
+       XOR     R16, R15; \
+       XOR     R17, R8; \
+       ADD     R8, R15
+
+// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
+// The values for e and a are stored in d and h, ready for rotation.
+#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
+       SHA512T1(const, e, f, g, h); \
+       SHA512T2(a, b, c); \
+       MOVD    R15, h; \
+       ADD     R14, d; \
+       ADD     R14, h
+
+#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
+       MSGSCHEDULE0(index); \
+       SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
+
+#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
+       MSGSCHEDULE1(index); \
+       SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
+
+// func block(dig *digest, p []byte)
+TEXT ·block(SB),0,$680-32
+       MOVD    p_base+8(FP), R6
+       MOVD    p_len+16(FP), R7
+       SRD     $7, R7
+       SLD     $7, R7
+
+       ADD     R6, R7, R8
+       MOVD    R8, 640(R1)
+       CMP     R6, R8
+       BEQ     end
+
+       MOVD    dig+0(FP), R9
+       MOVD    (0*8)(R9), R22          // a = H0
+       MOVD    (1*8)(R9), R23          // b = H1
+       MOVD    (2*8)(R9), R24          // c = H2
+       MOVD    (3*8)(R9), R25          // d = H3
+       MOVD    (4*8)(R9), R26          // e = H4
+       MOVD    (5*8)(R9), R27          // f = H5
+       MOVD    (6*8)(R9), R28          // g = H6
+       MOVD    (7*8)(R9), R29          // h = H7
+
+loop:
+       MOVD    R1, R9                  // R9: message schedule
+
+       SHA512ROUND0(0, 0x428a2f98d728ae22, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND0(1, 0x7137449123ef65cd, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND0(3, 0xe9b5dba58189dbbc, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND0(4, 0x3956c25bf348b538, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND0(5, 0x59f111f1b605d019, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND0(6, 0x923f82a4af194f9b, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND0(7, 0xab1c5ed5da6d8118, R23, R24, R25, R26, R27, R28, R29, R22)
+       SHA512ROUND0(8, 0xd807aa98a3030242, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND0(9, 0x12835b0145706fbe, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND0(10, 0x243185be4ee4b28c, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND0(12, 0x72be5d74f27b896f, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND0(13, 0x80deb1fe3b1696b1, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND0(14, 0x9bdc06a725c71235, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND0(15, 0xc19bf174cf692694, R23, R24, R25, R26, R27, R28, R29, R22)
+
+       SHA512ROUND1(16, 0xe49b69c19ef14ad2, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND1(17, 0xefbe4786384f25e3, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND1(19, 0x240ca1cc77ac9c65, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND1(20, 0x2de92c6f592b0275, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND1(21, 0x4a7484aa6ea6e483, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND1(23, 0x76f988da831153b5, R23, R24, R25, R26, R27, R28, R29, R22)
+       SHA512ROUND1(24, 0x983e5152ee66dfab, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND1(25, 0xa831c66d2db43210, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND1(26, 0xb00327c898fb213f, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND1(27, 0xbf597fc7beef0ee4, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND1(28, 0xc6e00bf33da88fc2, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND1(29, 0xd5a79147930aa725, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND1(30, 0x06ca6351e003826f, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND1(31, 0x142929670a0e6e70, R23, R24, R25, R26, R27, R28, R29, R22)
+       SHA512ROUND1(32, 0x27b70a8546d22ffc, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND1(33, 0x2e1b21385c26c926, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND1(35, 0x53380d139d95b3df, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND1(36, 0x650a73548baf63de, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND1(37, 0x766a0abb3c77b2a8, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND1(38, 0x81c2c92e47edaee6, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND1(39, 0x92722c851482353b, R23, R24, R25, R26, R27, R28, R29, R22)
+       SHA512ROUND1(40, 0xa2bfe8a14cf10364, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND1(41, 0xa81a664bbc423001, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND1(42, 0xc24b8b70d0f89791, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND1(43, 0xc76c51a30654be30, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND1(44, 0xd192e819d6ef5218, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND1(45, 0xd69906245565a910, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND1(46, 0xf40e35855771202a, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND1(47, 0x106aa07032bbd1b8, R23, R24, R25, R26, R27, R28, R29, R22)
+       SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND1(49, 0x1e376c085141ab53, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND1(50, 0x2748774cdf8eeb99, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND1(52, 0x391c0cb3c5c95a63, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND1(54, 0x5b9cca4f7763e373, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R23, R24, R25, R26, R27, R28, R29, R22)
+       SHA512ROUND1(56, 0x748f82ee5defb2fc, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND1(57, 0x78a5636f43172f60, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND1(58, 0x84c87814a1f0ab72, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND1(59, 0x8cc702081a6439ec, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND1(60, 0x90befffa23631e28, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND1(61, 0xa4506cebde82bde9, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND1(62, 0xbef9a3f7b2c67915, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND1(63, 0xc67178f2e372532b, R23, R24, R25, R26, R27, R28, R29, R22)
+       SHA512ROUND1(64, 0xca273eceea26619c, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND1(65, 0xd186b8c721c0c207, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND1(67, 0xf57d4f7fee6ed178, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND1(68, 0x06f067aa72176fba, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND1(69, 0x0a637dc5a2c898a6, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND1(70, 0x113f9804bef90dae, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND1(71, 0x1b710b35131c471b, R23, R24, R25, R26, R27, R28, R29, R22)
+       SHA512ROUND1(72, 0x28db77f523047d84, R22, R23, R24, R25, R26, R27, R28, R29)
+       SHA512ROUND1(73, 0x32caab7b40c72493, R29, R22, R23, R24, R25, R26, R27, R28)
+       SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R28, R29, R22, R23, R24, R25, R26, R27)
+       SHA512ROUND1(75, 0x431d67c49c100d4c, R27, R28, R29, R22, R23, R24, R25, R26)
+       SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R26, R27, R28, R29, R22, R23, R24, R25)
+       SHA512ROUND1(77, 0x597f299cfc657e2a, R25, R26, R27, R28, R29, R22, R23, R24)
+       SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R24, R25, R26, R27, R28, R29, R22, R23)
+       SHA512ROUND1(79, 0x6c44198c4a475817, R23, R24, R25, R26, R27, R28, R29, R22)
+
+       MOVD    dig+0(FP), R9
+       MOVD    (0*8)(R9), R21
+       ADD     R21, R22        // H0 = a + H0
+       MOVD    R22, (0*8)(R9)
+       MOVD    (1*8)(R9), R21
+       ADD     R21, R23        // H1 = b + H1
+       MOVD    R23, (1*8)(R9)
+       MOVD    (2*8)(R9), R21
+       ADD     R21, R24        // H2 = c + H2
+       MOVD    R24, (2*8)(R9)
+       MOVD    (3*8)(R9), R21
+       ADD     R21, R25        // H3 = d + H3
+       MOVD    R25, (3*8)(R9)
+       MOVD    (4*8)(R9), R21
+       ADD     R21, R26        // H4 = e + H4
+       MOVD    R26, (4*8)(R9)
+       MOVD    (5*8)(R9), R21
+       ADD     R21, R27        // H5 = f + H5
+       MOVD    R27, (5*8)(R9)
+       MOVD    (6*8)(R9), R21
+       ADD     R21, R28        // H6 = g + H6
+       MOVD    R28, (6*8)(R9)
+       MOVD    (7*8)(R9), R21
+       ADD     R21, R29        // H7 = h + H7
+       MOVD    R29, (7*8)(R9)
+
+       ADD     $128, R6
+       MOVD    640(R1), R21
+       CMPU    R6, R21
+       BLT     loop
+
+end:
+       RET