From: Lynn Boger
Date: Mon, 28 Feb 2022 21:36:11 +0000 (-0600)
Subject: crypto/sha512: add BE support to PPC64 asm implementation
X-Git-Tag: go1.19beta1~1191
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=fc5b64e19b8e9719b88fd1a8e3a9fa033c5bc1b4;p=gostls13.git

crypto/sha512: add BE support to PPC64 asm implementation

This adds big endian support to the assembly implementation of
sha512. There was a recent request to do this for sha256 for AIX
users; for completeness, the same is being done for sha512.

The majority of the code is common between big and little endian,
with a few differences controlled by ifdefs: on LE, a mask must be
generated and used with VPERM instructions to put the bytes in the
correct order, and some VPERMs need the V registers in a different
order.

name         old time/op  new time/op  delta
Hash8Bytes   1.02µs ± 0%  0.38µs ± 0%  -62.68%
Hash1K       7.01µs ± 0%  2.43µs ± 0%  -65.42%
Hash8K       50.2µs ± 0%  14.6µs ± 0%  -70.89%

Updates #50785

Change-Id: I739b5e7c07b22b5748af11ca781e82ac67adb4f7
Reviewed-on: https://go-review.googlesource.com/c/go/+/388654
Reviewed-by: Cherry Mui
Trust: Lynn Boger
Run-TryBot: Lynn Boger
TryBot-Result: Gopher Robot
---

diff --git a/src/crypto/sha512/sha512block_decl.go b/src/crypto/sha512/sha512block_decl.go
index c6dcdf5db6..52278ae690 100644
--- a/src/crypto/sha512/sha512block_decl.go
+++ b/src/crypto/sha512/sha512block_decl.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build s390x || ppc64le
+//go:build s390x || ppc64le || ppc64
 
 package sha512
 
diff --git a/src/crypto/sha512/sha512block_generic.go b/src/crypto/sha512/sha512block_generic.go
index 62ea237867..9f0c2f2c5d 100644
--- a/src/crypto/sha512/sha512block_generic.go
+++ b/src/crypto/sha512/sha512block_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !amd64 && !s390x && !ppc64le
+//go:build !amd64 && !s390x && !ppc64le && !ppc64
 
 package sha512
 
diff --git a/src/crypto/sha512/sha512block_ppc64le.s b/src/crypto/sha512/sha512block_ppc64x.s
similarity index 93%
rename from src/crypto/sha512/sha512block_ppc64le.s
rename to src/crypto/sha512/sha512block_ppc64x.s
index 55f0c06c7a..955900b714 100644
--- a/src/crypto/sha512/sha512block_ppc64le.s
+++ b/src/crypto/sha512/sha512block_ppc64x.s
@@ -10,6 +10,8 @@
 // # details see http://www.openssl.org/~appro/cryptogams/.
 // # ====================================================================
 
+//go:build ppc64 || ppc64le
+
 #include "textflag.h"
 
 // SHA512 block routine. See sha512block.go for Go equivalent.
@@ -66,10 +68,6 @@
 #define HEX10	R10
 #define HEX20	R25
 #define HEX30	R26
-#define HEX40	R27
-#define HEX50	R28
-#define HEX60	R29
-#define HEX70	R31
 
 // V0-V7 are A-H
 // V8-V23 are used for the message schedule
@@ -81,6 +79,14 @@
 #define s1	V29
 #define LEMASK	V31 // Permutation control register for little endian
 
+// VPERM is needed on LE to switch the bytes
+
+#ifdef GOARCH_ppc64le
+#define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
+#else
+#define VPERMLE(va,vb,vc,vt)
+#endif
+
 // 2 copies of each Kt, to fill both doublewords of a vector register
 DATA ·kcon+0x000(SB)/8, $0x428a2f98d728ae22
 DATA ·kcon+0x008(SB)/8, $0x428a2f98d728ae22
@@ -306,15 +312,15 @@ TEXT ·block(SB),0,$128-32
 	MOVWZ	$0x10, HEX10
 	MOVWZ	$0x20, HEX20
 	MOVWZ	$0x30, HEX30
-	MOVWZ	$0x40, HEX40
-	MOVWZ	$0x50, HEX50
-	MOVWZ	$0x60, HEX60
-	MOVWZ	$0x70, HEX70
 
+// Generate the mask used with VPERM for LE
+
+#ifdef GOARCH_ppc64le
 	MOVWZ	$8, IDX
 	LVSL	(IDX)(R0), LEMASK
 	VSPLTISB	$0x0F, KI
 	VXOR	KI, LEMASK, LEMASK
+#endif
 
 	LXVD2X	(CTX)(HEX00), VS32	// v0 = vs32
 	LXVD2X	(CTX)(HEX10), VS34	// v2 = vs34
@@ -333,62 +339,64 @@ loop:
 	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
 	ADD	$16, INP
 
-	STVX	V0, (OFFLOAD+HEX00)
-	STVX	V1, (OFFLOAD+HEX10)
-	STVX	V2, (OFFLOAD+HEX20)
-	STVX	V3, (OFFLOAD+HEX30)
-	STVX	V4, (OFFLOAD+HEX40)
-	STVX	V5, (OFFLOAD+HEX50)
-	STVX	V6, (OFFLOAD+HEX60)
-	STVX	V7, (OFFLOAD+HEX70)
+	// Copy V0-V7 to VS24-VS31
+
+	XXLOR	V0, V0, VS24
+	XXLOR	V1, V1, VS25
+	XXLOR	V2, V2, VS26
+	XXLOR	V3, V3, VS27
+	XXLOR	V4, V4, VS28
+	XXLOR	V5, V5, VS29
+	XXLOR	V6, V6, VS30
+	XXLOR	V7, V7, VS31
 
 	VADDUDM	KI, V7, V7	// h+K[i]
 	LVX	(TBL)(IDX), KI
 	ADD	$16, IDX
 
-	VPERM	V8, V8, LEMASK, V8
+	VPERMLE(V8,V8,LEMASK,V8)
 	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
 	LXVD2X	(INP)(R0), VS42	// load v10 (=vs42) in advance
 	ADD	$16, INP, INP
 	VSLDOI	$8, V8, V8, V9
 	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
-	VPERM	V10, V10, LEMASK, V10
+	VPERMLE(V10,V10,LEMASK,V10)
 	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
 	LXVD2X	(INP)(R0), VS44	// load v12 (=vs44) in advance
 	ADD	$16, INP, INP
 	VSLDOI	$8, V10, V10, V11
 	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
-	VPERM	V12, V12, LEMASK, V12
+	VPERMLE(V12,V12,LEMASK,V12)
 	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
 	LXVD2X	(INP)(R0), VS46	// load v14 (=vs46) in advance
 	ADD	$16, INP, INP
 	VSLDOI	$8, V12, V12, V13
 	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
-	VPERM	V14, V14, LEMASK, V14
+	VPERMLE(V14,V14,LEMASK,V14)
 	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
 	LXVD2X	(INP)(R0), VS48	// load v16 (=vs48) in advance
 	ADD	$16, INP, INP
 	VSLDOI	$8, V14, V14, V15
 	SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
-	VPERM	V16, V16, LEMASK, V16
+	VPERMLE(V16,V16,LEMASK,V16)
 	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
 	LXVD2X	(INP)(R0), VS50	// load v18 (=vs50) in advance
 	ADD	$16, INP, INP
 	VSLDOI	$8, V16, V16, V17
 	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
-	VPERM	V18, V18, LEMASK, V18
+	VPERMLE(V18,V18,LEMASK,V18)
 	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
 	LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
 	ADD	$16, INP, INP
 	VSLDOI	$8, V18, V18, V19
 	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
-	VPERM	V20, V20, LEMASK, V20
+	VPERMLE(V20,V20,LEMASK,V20)
 	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
 	LXVD2X	(INP)(R0), VS54	// load v22 (=vs54) in advance
 	ADD	$16, INP, INP
 	VSLDOI	$8, V20, V20, V21
 	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
-	VPERM	V22, V22, LEMASK, V22
+	VPERMLE(V22,V22,LEMASK,V22)
 	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
 	VSLDOI	$8, V22, V22, V23
 	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
@@ -416,31 +424,37 @@ L16_xx:
 
 	BC	0x10, 0, L16_xx		// bdnz
 
-	LVX	(OFFLOAD)(HEX00), V10
-
-	LVX	(OFFLOAD)(HEX10), V11
+	XXLOR	VS24, VS24, V10
+	XXLOR	VS25, VS25, V11
+	XXLOR	VS26, VS26, V12
+	XXLOR	VS27, VS27, V13
+	XXLOR	VS28, VS28, V14
+	XXLOR	VS29, VS29, V15
+	XXLOR	VS30, VS30, V16
+	XXLOR	VS31, VS31, V17
 	VADDUDM	V10, V0, V0
-	LVX	(OFFLOAD)(HEX20), V12
 	VADDUDM	V11, V1, V1
-	LVX	(OFFLOAD)(HEX30), V13
 	VADDUDM	V12, V2, V2
-	LVX	(OFFLOAD)(HEX40), V14
 	VADDUDM	V13, V3, V3
-	LVX	(OFFLOAD)(HEX50), V15
 	VADDUDM	V14, V4, V4
-	LVX	(OFFLOAD)(HEX60), V16
 	VADDUDM	V15, V5, V5
-	LVX	(OFFLOAD)(HEX70), V17
 	VADDUDM	V16, V6, V6
 	VADDUDM	V17, V7, V7
 
 	CMPU	INP, END
 	BLT	loop
 
+#ifdef GOARCH_ppc64le
 	VPERM	V0, V1, KI, V0
 	VPERM	V2, V3, KI, V2
 	VPERM	V4, V5, KI, V4
 	VPERM	V6, V7, KI, V6
+#else
+	VPERM	V1, V0, KI, V0
+	VPERM	V3, V2, KI, V2
+	VPERM	V5, V4, KI, V4
+	VPERM	V7, V6, KI, V6
+#endif
 	STXVD2X	VS32, (CTX+HEX00)	// v0 = vs32
 	STXVD2X	VS34, (CTX+HEX10)	// v2 = vs34
 	STXVD2X	VS36, (CTX+HEX20)	// v4 = vs36
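
A short, standalone Go sketch follows; it is illustrative only and not part of the change above. It shows, in plain Go, the byte-order issue that the ppc64le-only LEMASK/VPERM handling described in the commit message works around: SHA-512 consumes each 8-byte chunk of a message block as a big-endian 64-bit word, so a big-endian load already has the layout the rounds expect, while a little-endian load is byte-reversed and must be permuted. The example bytes and program are arbitrary and for explanation only.

// Illustrative only: the byte-order difference that the ppc64le-only
// LEMASK/VPERM path in the assembly above compensates for.
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// One 8-byte chunk of a SHA-512 message block (arbitrary example bytes).
	chunk := []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}

	// The message schedule needs the big-endian interpretation of the chunk.
	be := binary.BigEndian.Uint64(chunk)

	// A raw little-endian read of the same bytes yields the reversed value,
	// which is why the assembly permutes bytes (VPERM with LEMASK) on ppc64le
	// but needs no permute on ppc64.
	le := binary.LittleEndian.Uint64(chunk)

	fmt.Printf("big-endian word:    %#x\n", be) // 0x102030405060708
	fmt.Printf("little-endian read: %#x\n", le) // 0x807060504030201
}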