]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/sha256: optimize arm64 sha256 implemention
authorfanzha02 <fannie.zhang@arm.com>
Fri, 28 Jul 2017 04:14:45 +0000 (04:14 +0000)
committerBrad Fitzpatrick <bradfitz@golang.org>
Tue, 14 Nov 2017 18:18:02 +0000 (18:18 +0000)
Optimize with ARMv8 SHA256 instructions.
Result (Cortex-A72)

name           old time/op    new time/op      delta
Hash8Bytes-64    1.54µs ± 1%      0.61µs ± 9%    -60.67%  (p=0.008 n=5+5)
Hash1K-64        17.2µs ± 1%       1.4µs ± 2%    -91.91%  (p=0.008 n=5+5)
Hash8K-64         127µs ± 0%         7µs ± 1%    -94.42%  (p=0.008 n=5+5)

name           old speed      new speed        delta
Hash8Bytes-64  5.20MB/s ± 1%   13.23MB/s ±10%   +154.58%  (p=0.008 n=5+5)
Hash1K-64      59.4MB/s ± 1%   735.1MB/s ± 2%  +1136.96%  (p=0.008 n=5+5)
Hash8K-64      64.5MB/s ± 0%  1156.3MB/s ± 1%  +1692.75%  (p=0.008 n=5+5)

Change-Id: I47eca6471b75cd07cb0c77477053a07d0de7494f
Reviewed-on: https://go-review.googlesource.com/61570
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/crypto/sha256/sha256block_arm64.go [new file with mode: 0644]
src/crypto/sha256/sha256block_arm64.s [new file with mode: 0644]
src/crypto/sha256/sha256block_generic.go

diff --git a/src/crypto/sha256/sha256block_arm64.go b/src/crypto/sha256/sha256block_arm64.go
new file mode 100644 (file)
index 0000000..48c436b
--- /dev/null
@@ -0,0 +1,22 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha256
+
+import "internal/cpu"
+
+var k = _K
+
+var hasSHA2 = cpu.ARM64.HasSHA2
+
+func sha256block(h []uint32, p []byte, k []uint32)
+
+func block(dig *digest, p []byte) {
+       if !hasSHA2 {
+               blockGeneric(dig, p)
+       } else {
+               h := dig.h[:]
+               sha256block(h, p, k)
+       }
+}
diff --git a/src/crypto/sha256/sha256block_arm64.s b/src/crypto/sha256/sha256block_arm64.s
new file mode 100644 (file)
index 0000000..a88bfa1
--- /dev/null
@@ -0,0 +1,119 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define HASHUPDATE \
+       SHA256H V9.S4, V3, V2 \
+       SHA256H2        V9.S4, V8, V3 \
+       VMOV    V2.B16, V8.B16
+
+// func sha256block(h []uint32, p []byte, k []uint32)
+TEXT ·sha256block(SB),NOSPLIT,$0
+       MOVD    h_base+0(FP), R0                           // Hash value fisrt address
+       MOVD    p_base+24(FP), R1                          // message first address
+       MOVD    k_base+48(FP), R2                          // k constants first address
+       MOVD    p_len+32(FP), R3                           // message length
+       VLD1    (R0), [V0.S4, V1.S4]                       // load h(a,b,c,d,e,f,g,h)
+       VLD1.P  64(R2), [V16.S4, V17.S4, V18.S4, V19.S4]
+       VLD1.P  64(R2), [V20.S4, V21.S4, V22.S4, V23.S4]
+       VLD1.P  64(R2), [V24.S4, V25.S4, V26.S4, V27.S4]
+       VLD1    (R2), [V28.S4, V29.S4, V30.S4, V31.S4]     //load 64*4bytes K constant(K0-K63)
+
+blockloop:
+
+       VLD1.P  16(R1), [V4.B16]                            // load 16bytes message
+       VLD1.P  16(R1), [V5.B16]                            // load 16bytes message
+       VLD1.P  16(R1), [V6.B16]                            // load 16bytes message
+       VLD1.P  16(R1), [V7.B16]                            // load 16bytes message
+       VMOV    V0.B16, V2.B16                              // backup: VO h(dcba)
+       VMOV    V1.B16, V3.B16                              // backup: V1 h(hgfe)
+       VMOV    V2.B16, V8.B16
+       VREV32  V4.B16, V4.B16                              // prepare for using message in Byte format
+       VREV32  V5.B16, V5.B16
+       VREV32  V6.B16, V6.B16
+       VREV32  V7.B16, V7.B16
+
+       VADD    V16.S4, V4.S4, V9.S4                        // V18(W0+K0...W3+K3)
+       SHA256SU0       V5.S4, V4.S4                        // V4: (su0(W1)+W0,...,su0(W4)+W3)
+       HASHUPDATE                                          // H4
+
+       VADD    V17.S4, V5.S4, V9.S4                        // V18(W4+K4...W7+K7)
+       SHA256SU0       V6.S4, V5.S4                        // V5: (su0(W5)+W4,...,su0(W8)+W7)
+       SHA256SU1       V7.S4, V6.S4, V4.S4                 // V4: W16-W19
+       HASHUPDATE                                          // H8
+
+       VADD    V18.S4, V6.S4, V9.S4                        // V18(W8+K8...W11+K11)
+       SHA256SU0       V7.S4, V6.S4                        // V6: (su0(W9)+W8,...,su0(W12)+W11)
+       SHA256SU1       V4.S4, V7.S4, V5.S4                 // V5: W20-W23
+       HASHUPDATE                                          // H12
+
+       VADD    V19.S4, V7.S4, V9.S4                        // V18(W12+K12...W15+K15)
+       SHA256SU0       V4.S4, V7.S4                        // V7: (su0(W13)+W12,...,su0(W16)+W15)
+       SHA256SU1       V5.S4, V4.S4, V6.S4                 // V6: W24-W27
+       HASHUPDATE                                          // H16
+
+       VADD    V20.S4, V4.S4, V9.S4                        // V18(W16+K16...W19+K19)
+       SHA256SU0       V5.S4, V4.S4                        // V4: (su0(W17)+W16,...,su0(W20)+W19)
+       SHA256SU1       V6.S4, V5.S4, V7.S4                 // V7: W28-W31
+       HASHUPDATE                                          // H20
+
+       VADD    V21.S4, V5.S4, V9.S4                        // V18(W20+K20...W23+K23)
+       SHA256SU0       V6.S4, V5.S4                        // V5: (su0(W21)+W20,...,su0(W24)+W23)
+       SHA256SU1       V7.S4, V6.S4, V4.S4                 // V4: W32-W35
+       HASHUPDATE                                          // H24
+
+       VADD    V22.S4, V6.S4, V9.S4                        // V18(W24+K24...W27+K27)
+       SHA256SU0       V7.S4, V6.S4                        // V6: (su0(W25)+W24,...,su0(W28)+W27)
+       SHA256SU1       V4.S4, V7.S4, V5.S4                 // V5: W36-W39
+       HASHUPDATE                                          // H28
+
+       VADD    V23.S4, V7.S4, V9.S4                        // V18(W28+K28...W31+K31)
+       SHA256SU0       V4.S4, V7.S4                        // V7: (su0(W29)+W28,...,su0(W32)+W31)
+       SHA256SU1       V5.S4, V4.S4, V6.S4                 // V6: W40-W43
+       HASHUPDATE                                          // H32
+
+       VADD    V24.S4, V4.S4, V9.S4                        // V18(W32+K32...W35+K35)
+       SHA256SU0       V5.S4, V4.S4                        // V4: (su0(W33)+W32,...,su0(W36)+W35)
+       SHA256SU1       V6.S4, V5.S4, V7.S4                 // V7: W44-W47
+       HASHUPDATE                                          // H36
+
+       VADD    V25.S4, V5.S4, V9.S4                        // V18(W36+K36...W39+K39)
+       SHA256SU0       V6.S4, V5.S4                        // V5: (su0(W37)+W36,...,su0(W40)+W39)
+       SHA256SU1       V7.S4, V6.S4, V4.S4                 // V4: W48-W51
+       HASHUPDATE                                          // H40
+
+       VADD    V26.S4, V6.S4, V9.S4                        // V18(W40+K40...W43+K43)
+       SHA256SU0       V7.S4, V6.S4                        // V6: (su0(W41)+W40,...,su0(W44)+W43)
+       SHA256SU1       V4.S4, V7.S4, V5.S4                 // V5: W52-W55
+       HASHUPDATE                                          // H44
+
+       VADD    V27.S4, V7.S4, V9.S4                        // V18(W44+K44...W47+K47)
+       SHA256SU0       V4.S4, V7.S4                        // V7: (su0(W45)+W44,...,su0(W48)+W47)
+       SHA256SU1       V5.S4, V4.S4, V6.S4                 // V6: W56-W59
+       HASHUPDATE                                          // H48
+
+       VADD    V28.S4, V4.S4, V9.S4                        // V18(W48+K48,...,W51+K51)
+       HASHUPDATE                                          // H52
+       SHA256SU1       V6.S4, V5.S4, V7.S4                 // V7: W60-W63
+
+       VADD    V29.S4, V5.S4, V9.S4                        // V18(W52+K52,...,W55+K55)
+       HASHUPDATE                                          // H56
+
+       VADD    V30.S4, V6.S4, V9.S4                        // V18(W59+K59,...,W59+K59)
+       HASHUPDATE                                          // H60
+
+       VADD    V31.S4, V7.S4, V9.S4                        // V18(W60+K60,...,W63+K63)
+       HASHUPDATE                                          // H64
+
+       SUB     $64, R3, R3                                 // message length - 64bytes, then compare with 64bytes
+       VADD    V2.S4, V0.S4, V0.S4
+       VADD    V3.S4, V1.S4, V1.S4
+       CBNZ    R3, blockloop
+
+sha256ret:
+
+       VST1    [V0.S4, V1.S4], (R0)                       // store hash value H
+       RET
+
index a182a5eacfe9f9dbdd3d03661a0b09bbee0d8360..61362f41260cc6cc2f4fe95d9a214b56ff4da0d0 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!386,!s390x,!ppc64le
+// +build !amd64,!386,!s390x,!ppc64le,!arm64
 
 package sha256