From 854ae03db4c6c398149dc1e46fbec49479d8cb1f Mon Sep 17 00:00:00 2001 From: Mike Strosaker Date: Sat, 29 Oct 2016 02:36:41 -0400 Subject: [PATCH] crypto/sha512: improve performance for sha512.block on ppc64le Adds an assembly implementation of sha512.block for ppc64le to improve its performance. This implementation is largely based on the original amd64 implementation, unrolling the 80 iterations of the inner loop. Fixes #17660 benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 1715 1133 -33.94% BenchmarkHash1K 10098 5513 -45.41% BenchmarkHash8K 68004 35278 -48.12% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 4.66 7.06 1.52x BenchmarkHash1K 101.40 185.72 1.83x BenchmarkHash8K 120.46 232.21 1.93x Change-Id: Ifd55a49a24cb159b3a09a8e928c3f37727aca103 Reviewed-on: https://go-review.googlesource.com/32320 Reviewed-by: Carlos Eduardo Seo Reviewed-by: David Chase Run-TryBot: David Chase TryBot-Result: Gobot Gobot --- src/crypto/sha512/sha512block_decl.go | 2 +- src/crypto/sha512/sha512block_generic.go | 2 +- src/crypto/sha512/sha512block_ppc64le.s | 293 +++++++++++++++++++++++ 3 files changed, 295 insertions(+), 2 deletions(-) create mode 100644 src/crypto/sha512/sha512block_ppc64le.s diff --git a/src/crypto/sha512/sha512block_decl.go b/src/crypto/sha512/sha512block_decl.go index 47d656a7e4..8194506bf6 100644 --- a/src/crypto/sha512/sha512block_decl.go +++ b/src/crypto/sha512/sha512block_decl.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 s390x +// +build amd64 s390x ppc64le package sha512 diff --git a/src/crypto/sha512/sha512block_generic.go b/src/crypto/sha512/sha512block_generic.go index 2c691baa3d..08f2e07178 100644 --- a/src/crypto/sha512/sha512block_generic.go +++ b/src/crypto/sha512/sha512block_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!s390x +// +build !amd64,!s390x,!ppc64le package sha512 diff --git a/src/crypto/sha512/sha512block_ppc64le.s b/src/crypto/sha512/sha512block_ppc64le.s new file mode 100644 index 0000000000..7b338d89f0 --- /dev/null +++ b/src/crypto/sha512/sha512block_ppc64le.s @@ -0,0 +1,293 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// SHA512 block routine. See sha512block.go for Go equivalent. +// +// The algorithm is detailed in FIPS 180-4: +// +// http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf +// +// Wt = Mt; for 0 <= t <= 15 +// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79 +// +// a = H0 +// b = H1 +// c = H2 +// d = H3 +// e = H4 +// f = H5 +// g = H6 +// h = H7 +// +// for t = 0 to 79 { +// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt +// T2 = BIGSIGMA0(a) + Maj(a,b,c) +// h = g +// g = f +// f = e +// e = d + T1 +// d = c +// c = b +// b = a +// a = T1 + T2 +// } +// +// H0 = a + H0 +// H1 = b + H1 +// H2 = c + H2 +// H3 = d + H3 +// H4 = e + H4 +// H5 = f + H5 +// H6 = g + H6 +// H7 = h + H7 + +// Wt = Mt; for 0 <= t <= 15 +#define MSGSCHEDULE0(index) \ + MOVD (index*8)(R6), R14; \ + RLWNM $24, R14, $-1, R21; \ + RLWMI $8, R14, $0x00FF0000, R21; \ + RLWMI $8, R14, $0x000000FF, R21; \ + SLD $32, R21; \ + SRD $32, R14, R20; \ + RLWNM $24, R20, $-1, R14; \ + RLWMI $8, R20, $0x00FF0000, R14; \ + RLWMI $8, R20, $0x000000FF, R14; \ + OR R21, R14; \ + MOVD R14, (index*8)(R9) + +// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79 +// SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x) +// SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x) +#define MSGSCHEDULE1(index) \ + MOVD ((index-2)*8)(R9), R14; \ + MOVD R14, R16; \ + RLDCL $64-19, R14, $-1, R14; \ + MOVD R16, R17; \ + RLDCL $64-61, R16, $-1, R16; \ + SRD $6, R17; \ + MOVD ((index-15)*8)(R9), R15; \ + XOR R16, R14; \ + MOVD R15, R16; \ + XOR R17, R14; \ + RLDCL $64-1, R15, $-1, R15; \ + MOVD R16, R17; \ + SRD $7, R17; \ + RLDCL $64-8, R16, $-1, R16; \ + MOVD ((index-7)*8)(R9), R21; \ + ADD R21, R14; \ + XOR R16, R15; \ + XOR R17, R15; \ + MOVD ((index-16)*8)(R9), R21; \ + ADD R21, R15; \ + ADD R15, R14; \ + MOVD R14, ((index)*8)(R9) + +// Calculate T1 in R14 - uses R14, R16 and R17 registers. +// h is also used as an accumulator. Wt is passed in R14. +// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt +// BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x) +// Ch(x, y, z) = (x AND y) XOR (NOT x AND z) +#define SHA512T1(const, e, f, g, h) \ + MOVD $const, R17; \ + ADD R14, h; \ + MOVD e, R14; \ + ADD R17, h; \ + MOVD e, R16; \ + RLDCL $64-14, R14, $-1, R14; \ + MOVD e, R17; \ + RLDCL $64-18, R16, $-1, R16; \ + XOR R16, R14; \ + MOVD e, R16; \ + RLDCL $64-41, R17, $-1, R17; \ + AND f, R16; \ + XOR R14, R17; \ + MOVD e, R14; \ + NOR R14, R14, R14; \ + ADD R17, h; \ + AND g, R14; \ + XOR R16, R14; \ + ADD h, R14 + +// Calculate T2 in R15 - uses R15, R16, R17 and R8 registers. +// T2 = BIGSIGMA0(a) + Maj(a, b, c) +// BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x) +// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) +#define SHA512T2(a, b, c) \ + MOVD a, R8; \ + MOVD c, R15; \ + RLDCL $64-28, R8, $-1, R8; \ + MOVD a, R17; \ + AND b, R15; \ + RLDCL $64-34, R17, $-1, R17; \ + MOVD a, R16; \ + AND c, R16; \ + XOR R17, R8; \ + XOR R16, R15; \ + MOVD a, R17; \ + MOVD b, R16; \ + RLDCL $64-39, R17, $-1, R17; \ + AND a, R16; \ + XOR R16, R15; \ + XOR R17, R8; \ + ADD R8, R15 + +// Calculate T1 and T2, then e = d + T1 and a = T1 + T2. +// The values for e and a are stored in d and h, ready for rotation. +#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \ + SHA512T1(const, e, f, g, h); \ + SHA512T2(a, b, c); \ + MOVD R15, h; \ + ADD R14, d; \ + ADD R14, h + +#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \ + MSGSCHEDULE0(index); \ + SHA512ROUND(index, const, a, b, c, d, e, f, g, h) + +#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \ + MSGSCHEDULE1(index); \ + SHA512ROUND(index, const, a, b, c, d, e, f, g, h) + +// func block(dig *digest, p []byte) +TEXT ·block(SB),0,$680-32 + MOVD p_base+8(FP), R6 + MOVD p_len+16(FP), R7 + SRD $7, R7 + SLD $7, R7 + + ADD R6, R7, R8 + MOVD R8, 640(R1) + CMP R6, R8 + BEQ end + + MOVD dig+0(FP), R9 + MOVD (0*8)(R9), R22 // a = H0 + MOVD (1*8)(R9), R23 // b = H1 + MOVD (2*8)(R9), R24 // c = H2 + MOVD (3*8)(R9), R25 // d = H3 + MOVD (4*8)(R9), R26 // e = H4 + MOVD (5*8)(R9), R27 // f = H5 + MOVD (6*8)(R9), R28 // g = H6 + MOVD (7*8)(R9), R29 // h = H7 + +loop: + MOVD R1, R9 // R9: message schedule + + SHA512ROUND0(0, 0x428a2f98d728ae22, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND0(1, 0x7137449123ef65cd, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND0(3, 0xe9b5dba58189dbbc, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND0(4, 0x3956c25bf348b538, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND0(5, 0x59f111f1b605d019, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND0(6, 0x923f82a4af194f9b, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND0(7, 0xab1c5ed5da6d8118, R23, R24, R25, R26, R27, R28, R29, R22) + SHA512ROUND0(8, 0xd807aa98a3030242, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND0(9, 0x12835b0145706fbe, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND0(10, 0x243185be4ee4b28c, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND0(12, 0x72be5d74f27b896f, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND0(13, 0x80deb1fe3b1696b1, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND0(14, 0x9bdc06a725c71235, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND0(15, 0xc19bf174cf692694, R23, R24, R25, R26, R27, R28, R29, R22) + + SHA512ROUND1(16, 0xe49b69c19ef14ad2, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND1(17, 0xefbe4786384f25e3, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND1(19, 0x240ca1cc77ac9c65, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND1(20, 0x2de92c6f592b0275, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND1(21, 0x4a7484aa6ea6e483, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND1(23, 0x76f988da831153b5, R23, R24, R25, R26, R27, R28, R29, R22) + SHA512ROUND1(24, 0x983e5152ee66dfab, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND1(25, 0xa831c66d2db43210, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND1(26, 0xb00327c898fb213f, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND1(27, 0xbf597fc7beef0ee4, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND1(28, 0xc6e00bf33da88fc2, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND1(29, 0xd5a79147930aa725, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND1(30, 0x06ca6351e003826f, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND1(31, 0x142929670a0e6e70, R23, R24, R25, R26, R27, R28, R29, R22) + SHA512ROUND1(32, 0x27b70a8546d22ffc, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND1(33, 0x2e1b21385c26c926, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND1(35, 0x53380d139d95b3df, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND1(36, 0x650a73548baf63de, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND1(37, 0x766a0abb3c77b2a8, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND1(38, 0x81c2c92e47edaee6, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND1(39, 0x92722c851482353b, R23, R24, R25, R26, R27, R28, R29, R22) + SHA512ROUND1(40, 0xa2bfe8a14cf10364, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND1(41, 0xa81a664bbc423001, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND1(42, 0xc24b8b70d0f89791, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND1(43, 0xc76c51a30654be30, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND1(44, 0xd192e819d6ef5218, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND1(45, 0xd69906245565a910, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND1(46, 0xf40e35855771202a, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND1(47, 0x106aa07032bbd1b8, R23, R24, R25, R26, R27, R28, R29, R22) + SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND1(49, 0x1e376c085141ab53, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND1(50, 0x2748774cdf8eeb99, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND1(52, 0x391c0cb3c5c95a63, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND1(54, 0x5b9cca4f7763e373, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R23, R24, R25, R26, R27, R28, R29, R22) + SHA512ROUND1(56, 0x748f82ee5defb2fc, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND1(57, 0x78a5636f43172f60, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND1(58, 0x84c87814a1f0ab72, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND1(59, 0x8cc702081a6439ec, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND1(60, 0x90befffa23631e28, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND1(61, 0xa4506cebde82bde9, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND1(62, 0xbef9a3f7b2c67915, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND1(63, 0xc67178f2e372532b, R23, R24, R25, R26, R27, R28, R29, R22) + SHA512ROUND1(64, 0xca273eceea26619c, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND1(65, 0xd186b8c721c0c207, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND1(67, 0xf57d4f7fee6ed178, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND1(68, 0x06f067aa72176fba, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND1(69, 0x0a637dc5a2c898a6, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND1(70, 0x113f9804bef90dae, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND1(71, 0x1b710b35131c471b, R23, R24, R25, R26, R27, R28, R29, R22) + SHA512ROUND1(72, 0x28db77f523047d84, R22, R23, R24, R25, R26, R27, R28, R29) + SHA512ROUND1(73, 0x32caab7b40c72493, R29, R22, R23, R24, R25, R26, R27, R28) + SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R28, R29, R22, R23, R24, R25, R26, R27) + SHA512ROUND1(75, 0x431d67c49c100d4c, R27, R28, R29, R22, R23, R24, R25, R26) + SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R26, R27, R28, R29, R22, R23, R24, R25) + SHA512ROUND1(77, 0x597f299cfc657e2a, R25, R26, R27, R28, R29, R22, R23, R24) + SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R24, R25, R26, R27, R28, R29, R22, R23) + SHA512ROUND1(79, 0x6c44198c4a475817, R23, R24, R25, R26, R27, R28, R29, R22) + + MOVD dig+0(FP), R9 + MOVD (0*8)(R9), R21 + ADD R21, R22 // H0 = a + H0 + MOVD R22, (0*8)(R9) + MOVD (1*8)(R9), R21 + ADD R21, R23 // H1 = b + H1 + MOVD R23, (1*8)(R9) + MOVD (2*8)(R9), R21 + ADD R21, R24 // H2 = c + H2 + MOVD R24, (2*8)(R9) + MOVD (3*8)(R9), R21 + ADD R21, R25 // H3 = d + H3 + MOVD R25, (3*8)(R9) + MOVD (4*8)(R9), R21 + ADD R21, R26 // H4 = e + H4 + MOVD R26, (4*8)(R9) + MOVD (5*8)(R9), R21 + ADD R21, R27 // H5 = f + H5 + MOVD R27, (5*8)(R9) + MOVD (6*8)(R9), R21 + ADD R21, R28 // H6 = g + H6 + MOVD R28, (6*8)(R9) + MOVD (7*8)(R9), R21 + ADD R21, R29 // H7 = h + H7 + MOVD R29, (7*8)(R9) + + ADD $128, R6 + MOVD 640(R1), R21 + CMPU R6, R21 + BLT loop + +end: + RET -- 2.48.1