From: Paul E. Murphy
Date: Thu, 12 May 2022 14:45:59 +0000 (-0500)
Subject: crypto/sha256, cmd/internal/notsha256: improve PPC64 sha256
X-Git-Tag: go1.20rc1~575
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6f7e9e23074d771c6adfce84d74fa11dbd8b7619;p=gostls13.git

crypto/sha256, cmd/internal/notsha256: improve PPC64 sha256

This minimizes addi usage inside vector-heavy loops, which gives a
small performance uptick on P9 ppc64le/linux. Likewise, clean up some
minor whitespace issues around comments.

The implementation from crypto/sha256 is also shared with notsha256.
It is copied, but preserves notsha256's go:build directives; the two
files are otherwise identical now.

Previously, bootstrap restrictions required workarounds to support
XXLOR on older toolchains. These are no longer needed, as the minimum
bootstrap compiler (Go 1.17) supports XXLOR.

name                old speed      new speed      delta
Hash8Bytes/New      28.8MB/s ± 0%  30.5MB/s ± 0%  +5.98%
Hash8Bytes/Sum224   29.5MB/s ± 0%  31.3MB/s ± 0%  +6.17%
Hash8Bytes/Sum256   29.5MB/s ± 0%  31.2MB/s ± 0%  +5.80%
Hash1K/New           287MB/s ± 0%   312MB/s ± 0%  +8.60%
Hash1K/Sum224        289MB/s ± 0%   312MB/s ± 0%  +7.99%
Hash1K/Sum256        289MB/s ± 0%   312MB/s ± 0%  +7.98%
Hash8K/New           313MB/s ± 0%   338MB/s ± 0%  +8.12%
Hash8K/Sum224        313MB/s ± 0%   338MB/s ± 0%  +8.20%
Hash8K/Sum256        313MB/s ± 0%   338MB/s ± 0%  +8.12%

Change-Id: Ib386d6306673b4e6553ee745ec2e1b53a9722df1
Reviewed-on: https://go-review.googlesource.com/c/go/+/441815
TryBot-Result: Gopher Robot
Reviewed-by: Than McIntosh
Reviewed-by: Lynn Boger
Reviewed-by: David Chase
Reviewed-by: Archana Ravindar
Run-TryBot: Paul Murphy
---
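The heart of the change in the patch below: previously each round loaded the
next round constant with "LVX (TBL)(IDX), KI" and then bumped the index with
"ADD $16, IDX", so every load waited on the addi from the round before. Now
eighteen GPRs (R_x000-R_x110) hold fixed offsets, each round's LVX uses its
own offset register, and TBL itself advances once per block. A rough Go
analogy of that dependency-chain difference, as a sketch with illustrative
names (oldStyle, newStyle; a plain uint32 slice stands in for the kcon table):

	package main

	import "fmt"

	// oldStyle mirrors "LVX (TBL)(IDX), KI; ADD $16, IDX": each load
	// address depends on the increment from the previous iteration.
	func oldStyle(kcon []uint32) (sum uint32) {
		idx := 0
		for r := 0; r < 16; r++ {
			sum += kcon[idx]
			idx += 4 // the per-round addi this change removes
		}
		return
	}

	// newStyle mirrors "LVX (TBL)(R_xNNN), KI": the offsets are fixed
	// up front, so the 16 loads are independent of one another; the
	// base pointer moves once per block instead (ADD $0x100, TBL).
	func newStyle(kcon []uint32) (sum uint32) {
		for off := 0; off < 64; off += 4 {
			sum += kcon[off]
		}
		return
	}

	func main() {
		k := make([]uint32, 64)
		for i := range k {
			k[i] = uint32(i)
		}
		fmt.Println(oldStyle(k) == newStyle(k)) // true
	}

Also visible below: XXLORQ becomes XXLOR, and VSX operands are written with
their vector-register aliases (V0 rather than VS32), which retires the
"v0 = vs32" aliasing comments.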
diff --git a/src/cmd/internal/notsha256/sha256block_ppc64x.s b/src/cmd/internal/notsha256/sha256block_ppc64x.s
index e907d3b71b..ea4417d6c6 100644
--- a/src/cmd/internal/notsha256/sha256block_ppc64x.s
+++ b/src/cmd/internal/notsha256/sha256block_ppc64x.s
@@ -65,13 +65,31 @@
 #define CTX R3
 #define INP R4
 #define END R5
-#define TBL R6
-#define IDX R7
+#define TBL R6 // Pointer into kcon table
 #define LEN R9
 #define TEMP R12
 
-#define HEX00 R0
-#define HEX10 R10
+#define TBL_STRT R7 // Pointer to start of kcon table.
+
+#define R_x000 R0
+#define R_x010 R8
+#define R_x020 R10
+#define R_x030 R11
+#define R_x040 R14
+#define R_x050 R15
+#define R_x060 R16
+#define R_x070 R17
+#define R_x080 R18
+#define R_x090 R19
+#define R_x0a0 R20
+#define R_x0b0 R21
+#define R_x0c0 R22
+#define R_x0d0 R23
+#define R_x0e0 R24
+#define R_x0f0 R25
+#define R_x100 R26
+#define R_x110 R27
+
 // V0-V7 are A-H
 // V8-V23 are used for the message schedule
 
@@ -81,7 +99,7 @@
 #define S1 V27
 #define s0 V28
 #define s1 V29
-#define LEMASK V31 // Permutation control register for little endian
+#define LEMASK V31 // Permutation control register for little endian
 
 // 4 copies of each Kt, to fill all 4 words of a vector register
 DATA ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
@@ -216,7 +234,7 @@
 DATA ·kcon+0x400(SB)/8, $0x0000000000000000
 DATA ·kcon+0x408(SB)/8, $0x0000000000000000
 #ifdef GOARCH_ppc64le
-DATA ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
+DATA ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
 DATA ·kcon+0x418(SB)/8, $0x1011121300010203
 DATA ·kcon+0x420(SB)/8, $0x1011121310111213
 DATA ·kcon+0x428(SB)/8, $0x0405060700010203
@@ -224,7 +242,7 @@ DATA ·kcon+0x430(SB)/8, $0x1011121308090a0b
 DATA ·kcon+0x438(SB)/8, $0x0405060700010203
 #else
 DATA ·kcon+0x410(SB)/8, $0x1011121300010203
-DATA ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
+DATA ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
 DATA ·kcon+0x420(SB)/8, $0x0405060700010203
 DATA ·kcon+0x428(SB)/8, $0x1011121310111213
 DATA ·kcon+0x430(SB)/8, $0x0001020304050607
@@ -233,7 +251,7 @@ DATA ·kcon+0x438(SB)/8, $0x08090a0b10111213
 
 GLOBL ·kcon(SB), RODATA, $1088
 
-#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \
+#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
 	VSEL g, f, e, FUNC; \
 	VSHASIGMAW $15, e, $1, S1; \
 	VADDUWM xi, h, h; \
@@ -245,11 +263,10 @@ GLOBL ·kcon(SB), RODATA, $1088
 	VADDUWM KI, g, g; \
 	VADDUWM h, d, d; \
 	VADDUWM FUNC, S0, S0; \
-	LVX (TBL)(IDX), KI; \
-	ADD $16, IDX; \
+	LVX (TBL)(idx), KI; \
 	VADDUWM S0, h, h
 
-#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
+#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
 	VSHASIGMAW $0, xj_1, $0, s0; \
 	VSEL g, f, e, FUNC; \
 	VSHASIGMAW $15, e, $1, S1; \
@@ -265,8 +282,7 @@ GLOBL ·kcon(SB), RODATA, $1088
 	VADDUWM h, d, d; \
 	VADDUWM FUNC, S0, S0; \
 	VADDUWM s0, xj, xj; \
-	LVX (TBL)(IDX), KI; \
-	ADD $16, IDX; \
+	LVX (TBL)(idx), KI; \
 	VADDUWM S0, h, h; \
 	VADDUWM s1, xj, xj
 
@@ -289,18 +305,18 @@ TEXT ·block(SB),0,$0-32
 	CMP INP, END
 	BEQ end
 
-	MOVD $·kcon(SB), TBL
-	MOVWZ $0x10, HEX10
-	MOVWZ $8, IDX
+	MOVD $·kcon(SB), TBL_STRT
+	MOVD $0x10, R_x010
 
 #ifdef GOARCH_ppc64le
-	LVSL (IDX)(R0), LEMASK
+	MOVWZ $8, TEMP
+	LVSL (TEMP)(R0), LEMASK
 	VSPLTISB $0x0F, KI
 	VXOR KI, LEMASK, LEMASK
 #endif
 
-	LXVW4X (CTX)(HEX00), VS32 // v0 = vs32
-	LXVW4X (CTX)(HEX10), VS36 // v4 = vs36
+	LXVW4X (CTX)(R_x000), V0
+	LXVW4X (CTX)(R_x010), V4
 
 	// unpack the input values into vector registers
 	VSLDOI $4, V0, V0, V1
@@ -310,121 +326,135 @@ TEXT ·block(SB),0,$0-32
 	VSLDOI $8, V4, V4, V6
 	VSLDOI $12, V4, V4, V7
 
+	MOVD $0x020, R_x020
+	MOVD $0x030, R_x030
+	MOVD $0x040, R_x040
+	MOVD $0x050, R_x050
+	MOVD $0x060, R_x060
+	MOVD $0x070, R_x070
+	MOVD $0x080, R_x080
+	MOVD $0x090, R_x090
+	MOVD $0x0a0, R_x0a0
+	MOVD $0x0b0, R_x0b0
+	MOVD $0x0c0, R_x0c0
+	MOVD $0x0d0, R_x0d0
+	MOVD $0x0e0, R_x0e0
+	MOVD $0x0f0, R_x0f0
+	MOVD $0x100, R_x100
+	MOVD $0x110, R_x110
+
 loop:
-	LVX (TBL)(HEX00), KI
-	MOVWZ $16, IDX
+	MOVD TBL_STRT, TBL
+	LVX (TBL)(R_x000), KI
 
-	LXVD2X (INP)(R0), VS40 // load v8 (=vs40) in advance
-	ADD $16, INP
+	LXVD2X (INP)(R_x000), V8 // load v8 in advance
 
 	// Offload to VSR24-31 (aka FPR24-31)
-	XXLORQ VS32, VS32, VS24
-	XXLORQ VS33, VS33, VS25
-	XXLORQ VS34, VS34, VS26
-	XXLORQ VS35, VS35, VS27
-	XXLORQ VS36, VS36, VS28
-	XXLORQ VS37, VS37, VS29
-	XXLORQ VS38, VS38, VS30
-	XXLORQ VS39, VS39, VS31
-
-	VADDUWM KI, V7, V7 // h+K[i]
-	LVX (TBL)(IDX), KI
-	ADD $16, IDX
+	XXLOR V0, V0, VS24
+	XXLOR V1, V1, VS25
+	XXLOR V2, V2, VS26
+	XXLOR V3, V3, VS27
+	XXLOR V4, V4, VS28
+	XXLOR V5, V5, VS29
+	XXLOR V6, V6, VS30
+	XXLOR V7, V7, VS31
+
+	VADDUWM KI, V7, V7 // h+K[i]
+	LVX (TBL)(R_x010), KI
 
 	VPERMLE(V8, V8, LEMASK, V8)
 
-	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
+	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
 	VSLDOI $4, V8, V8, V9
-	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
+	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
 	VSLDOI $4, V9, V9, V10
-	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
-	LXVD2X (INP)(R0), VS44 // load v12 (=vs44) in advance
-	ADD $16, INP, INP
+	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
+	LXVD2X (INP)(R_x010), V12 // load v12 in advance
 	VSLDOI $4, V10, V10, V11
-	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
+	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
 	VPERMLE(V12, V12, LEMASK, V12)
-	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
+	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
 	VSLDOI $4, V12, V12, V13
-	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
+	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
 	VSLDOI $4, V13, V13, V14
-	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
-	LXVD2X (INP)(R0), VS48 // load v16 (=vs48) in advance
-	ADD $16, INP, INP
+	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
+	LXVD2X (INP)(R_x020), V16 // load v16 in advance
 	VSLDOI $4, V14, V14, V15
-	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
+	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
 	VPERMLE(V16, V16, LEMASK, V16)
-	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
+	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
 	VSLDOI $4, V16, V16, V17
-	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
+	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
 	VSLDOI $4, V17, V17, V18
-	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
+	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
 	VSLDOI $4, V18, V18, V19
-	LXVD2X (INP)(R0), VS52 // load v20 (=vs52) in advance
-	ADD $16, INP, INP
-	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
+	LXVD2X (INP)(R_x030), V20 // load v20 in advance
+	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
 	VPERMLE(V20, V20, LEMASK, V20)
-	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
+	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
 	VSLDOI $4, V20, V20, V21
-	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
+	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
 	VSLDOI $4, V21, V21, V22
-	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
+	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
 	VSLDOI $4, V22, V22, V23
-	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
+	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
 
-	MOVWZ $3, TEMP
-	MOVWZ TEMP, CTR
+	MOVD $3, TEMP
+	MOVD TEMP, CTR
+	ADD $0x120, TBL
+	ADD $0x40, INP
 
 L16_xx:
-	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
-	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
-	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
-	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
-	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
-	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
-	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
-	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
-	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
-	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
-	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
-	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
-	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
-	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
-	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
-	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
-
-	BC 0x10, 0, L16_xx // bdnz
-
-	XXLORQ VS24, VS24, VS42
-
-	XXLORQ VS25, VS25, VS43
+	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
+	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
+	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
+	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
+	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
+	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
+	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
+	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
+	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
+	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
+	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
+	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
+	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
+	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
+	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
+	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
+	ADD $0x100, TBL
+
+	BDNZ L16_xx
+
+	XXLOR VS24, VS24, V10
+
+	XXLOR VS25, VS25, V11
 	VADDUWM V10, V0, V0
-	XXLORQ VS26, VS26, VS44
+	XXLOR VS26, VS26, V12
 	VADDUWM V11, V1, V1
-	XXLORQ VS27, VS27, VS45
+	XXLOR VS27, VS27, V13
 	VADDUWM V12, V2, V2
-	XXLORQ VS28, VS28, VS46
+	XXLOR VS28, VS28, V14
 	VADDUWM V13, V3, V3
-	XXLORQ VS29, VS29, VS47
+	XXLOR VS29, VS29, V15
 	VADDUWM V14, V4, V4
-	XXLORQ VS30, VS30, VS48
+	XXLOR VS30, VS30, V16
 	VADDUWM V15, V5, V5
-	XXLORQ VS31, VS31, VS49
+	XXLOR VS31, VS31, V17
 	VADDUWM V16, V6, V6
 	VADDUWM V17, V7, V7
 	CMPU INP, END
 	BLT loop
 
-	LVX (TBL)(IDX), V8
-	ADD $16, IDX
+	LVX (TBL)(R_x000), V8
 	VPERM V0, V1, KI, V0
-	LVX (TBL)(IDX), V9
+	LVX (TBL)(R_x010), V9
 	VPERM V4, V5, KI, V4
 	VPERM V0, V2, V8, V0
 	VPERM V4, V6, V8, V4
 	VPERM V0, V3, V9, V0
 	VPERM V4, V7, V9, V4
-	STXVD2X VS32, (CTX+HEX00) // v0 = vs32
-	STXVD2X VS36, (CTX+HEX10) // v4 = vs36
+	STXVD2X V0, (CTX+R_x000)
+	STXVD2X V4, (CTX+R_x010)
 
 end:
 	RET
diff --git a/src/crypto/sha256/sha256block_ppc64x.s b/src/crypto/sha256/sha256block_ppc64x.s
index 617d42e1d7..b229ef619a 100644
--- a/src/crypto/sha256/sha256block_ppc64x.s
+++ b/src/crypto/sha256/sha256block_ppc64x.s
@@ -57,13 +57,31 @@
 #define CTX R3
 #define INP R4
 #define END R5
-#define TBL R6
-#define IDX R7
+#define TBL R6 // Pointer into kcon table
 #define LEN R9
 #define TEMP R12
 
-#define HEX00 R0
-#define HEX10 R10
+#define TBL_STRT R7 // Pointer to start of kcon table.
+
+#define R_x000 R0
+#define R_x010 R8
+#define R_x020 R10
+#define R_x030 R11
+#define R_x040 R14
+#define R_x050 R15
+#define R_x060 R16
+#define R_x070 R17
+#define R_x080 R18
+#define R_x090 R19
+#define R_x0a0 R20
+#define R_x0b0 R21
+#define R_x0c0 R22
+#define R_x0d0 R23
+#define R_x0e0 R24
+#define R_x0f0 R25
+#define R_x100 R26
+#define R_x110 R27
+
 // V0-V7 are A-H
 // V8-V23 are used for the message schedule
 
@@ -73,7 +91,7 @@
 #define S1 V27
 #define s0 V28
 #define s1 V29
-#define LEMASK V31 // Permutation control register for little endian
+#define LEMASK V31 // Permutation control register for little endian
 
 // 4 copies of each Kt, to fill all 4 words of a vector register
 DATA ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
@@ -208,7 +226,7 @@
 DATA ·kcon+0x400(SB)/8, $0x0000000000000000
 DATA ·kcon+0x408(SB)/8, $0x0000000000000000
 #ifdef GOARCH_ppc64le
-DATA ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
+DATA ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
 DATA ·kcon+0x418(SB)/8, $0x1011121300010203
 DATA ·kcon+0x420(SB)/8, $0x1011121310111213
 DATA ·kcon+0x428(SB)/8, $0x0405060700010203
@@ -216,7 +234,7 @@ DATA ·kcon+0x430(SB)/8, $0x1011121308090a0b
 DATA ·kcon+0x438(SB)/8, $0x0405060700010203
 #else
 DATA ·kcon+0x410(SB)/8, $0x1011121300010203
-DATA ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
+DATA ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
 DATA ·kcon+0x420(SB)/8, $0x0405060700010203
 DATA ·kcon+0x428(SB)/8, $0x1011121310111213
 DATA ·kcon+0x430(SB)/8, $0x0001020304050607
@@ -225,7 +243,7 @@ DATA ·kcon+0x438(SB)/8, $0x08090a0b10111213
 
 GLOBL ·kcon(SB), RODATA, $1088
 
-#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \
+#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
 	VSEL g, f, e, FUNC; \
 	VSHASIGMAW $15, e, $1, S1; \
 	VADDUWM xi, h, h; \
@@ -237,11 +255,10 @@ GLOBL ·kcon(SB), RODATA, $1088
 	VADDUWM KI, g, g; \
 	VADDUWM h, d, d; \
 	VADDUWM FUNC, S0, S0; \
-	LVX (TBL)(IDX), KI; \
-	ADD $16, IDX; \
+	LVX (TBL)(idx), KI; \
 	VADDUWM S0, h, h
 
-#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
+#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
 	VSHASIGMAW $0, xj_1, $0, s0; \
 	VSEL g, f, e, FUNC; \
 	VSHASIGMAW $15, e, $1, S1; \
@@ -257,8 +274,7 @@ GLOBL ·kcon(SB), RODATA, $1088
 	VADDUWM h, d, d; \
 	VADDUWM FUNC, S0, S0; \
 	VADDUWM s0, xj, xj; \
-	LVX (TBL)(IDX), KI; \
-	ADD $16, IDX; \
+	LVX (TBL)(idx), KI; \
 	VADDUWM S0, h, h; \
 	VADDUWM s1, xj, xj
 
@@ -281,18 +297,18 @@ TEXT ·block(SB),0,$0-32
 	CMP INP, END
 	BEQ end
 
-	MOVD $·kcon(SB), TBL
-	MOVWZ $0x10, HEX10
-	MOVWZ $8, IDX
+	MOVD $·kcon(SB), TBL_STRT
+	MOVD $0x10, R_x010
 
 #ifdef GOARCH_ppc64le
-	LVSL (IDX)(R0), LEMASK
+	MOVWZ $8, TEMP
+	LVSL (TEMP)(R0), LEMASK
 	VSPLTISB $0x0F, KI
 	VXOR KI, LEMASK, LEMASK
 #endif
 
-	LXVW4X (CTX)(HEX00), VS32 // v0 = vs32
-	LXVW4X (CTX)(HEX10), VS36 // v4 = vs36
+	LXVW4X (CTX)(R_x000), V0
+	LXVW4X (CTX)(R_x010), V4
 
 	// unpack the input values into vector registers
 	VSLDOI $4, V0, V0, V1
@@ -302,12 +318,28 @@ TEXT ·block(SB),0,$0-32
 	VSLDOI $8, V4, V4, V6
 	VSLDOI $12, V4, V4, V7
 
+	MOVD $0x020, R_x020
+	MOVD $0x030, R_x030
+	MOVD $0x040, R_x040
+	MOVD $0x050, R_x050
+	MOVD $0x060, R_x060
+	MOVD $0x070, R_x070
+	MOVD $0x080, R_x080
+	MOVD $0x090, R_x090
+	MOVD $0x0a0, R_x0a0
+	MOVD $0x0b0, R_x0b0
+	MOVD $0x0c0, R_x0c0
+	MOVD $0x0d0, R_x0d0
+	MOVD $0x0e0, R_x0e0
+	MOVD $0x0f0, R_x0f0
+	MOVD $0x100, R_x100
+	MOVD $0x110, R_x110
+
 loop:
-	LVX (TBL)(HEX00), KI
-	MOVWZ $16, IDX
+	MOVD TBL_STRT, TBL
+	LVX (TBL)(R_x000), KI
 
-	LXVD2X (INP)(R0), VS40 // load v8 (=vs40) in advance
-	ADD $16, INP
+	LXVD2X (INP)(R_x000), V8 // load v8 in advance
 
 	// Offload to VSR24-31 (aka FPR24-31)
 	XXLOR V0, V0, VS24
@@ -319,71 +351,70 @@ loop:
 	XXLOR V6, V6, VS30
 	XXLOR V7, V7, VS31
 
-	VADDUWM KI, V7, V7 // h+K[i]
-	LVX (TBL)(IDX), KI
-	ADD $16, IDX
+	VADDUWM KI, V7, V7 // h+K[i]
+	LVX (TBL)(R_x010), KI
 
 	VPERMLE(V8, V8, LEMASK, V8)
 
-	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
+	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
 	VSLDOI $4, V8, V8, V9
-	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
+	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
 	VSLDOI $4, V9, V9, V10
-	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
-	LXVD2X (INP)(R0), VS44 // load v12 (=vs44) in advance
-	ADD $16, INP, INP
+	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
+	LXVD2X (INP)(R_x010), V12 // load v12 in advance
 	VSLDOI $4, V10, V10, V11
-	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
+	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
 	VPERMLE(V12, V12, LEMASK, V12)
-	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
+	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
 	VSLDOI $4, V12, V12, V13
-	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
+	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
 	VSLDOI $4, V13, V13, V14
-	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
-	LXVD2X (INP)(R0), VS48 // load v16 (=vs48) in advance
-	ADD $16, INP, INP
+	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
+	LXVD2X (INP)(R_x020), V16 // load v16 in advance
 	VSLDOI $4, V14, V14, V15
-	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
+	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
 	VPERMLE(V16, V16, LEMASK, V16)
-	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
+	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
 	VSLDOI $4, V16, V16, V17
-	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
+	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
 	VSLDOI $4, V17, V17, V18
-	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
+	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
 	VSLDOI $4, V18, V18, V19
-	LXVD2X (INP)(R0), VS52 // load v20 (=vs52) in advance
-	ADD $16, INP, INP
-	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
+	LXVD2X (INP)(R_x030), V20 // load v20 in advance
+	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
 	VPERMLE(V20, V20, LEMASK, V20)
-	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
+	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
 	VSLDOI $4, V20, V20, V21
-	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
+	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
 	VSLDOI $4, V21, V21, V22
-	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
+	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
 	VSLDOI $4, V22, V22, V23
-	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
+	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
 
-	MOVWZ $3, TEMP
-	MOVWZ TEMP, CTR
+	MOVD $3, TEMP
+	MOVD TEMP, CTR
+	ADD $0x120, TBL
+	ADD $0x40, INP
 
 L16_xx:
-	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
-	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
-	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
-	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
-	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
-	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
-	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
-	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
-	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
-	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
-	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
-	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
-	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
-	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
-	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
-	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
-
-	BC 0x10, 0, L16_xx // bdnz
+	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
+	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
+	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
+	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
+	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
+	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
+	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
+	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
+	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
+	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
+	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
+	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
+	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
+	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
+	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
+	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
+	ADD $0x100, TBL
+
+	BDNZ L16_xx
 
 	XXLOR VS24, VS24, V10
 
@@ -406,17 +437,16 @@ L16_xx:
 	CMPU INP, END
 	BLT loop
 
-	LVX (TBL)(IDX), V8
-	ADD $16, IDX
+	LVX (TBL)(R_x000), V8
 	VPERM V0, V1, KI, V0
-	LVX (TBL)(IDX), V9
+	LVX (TBL)(R_x010), V9
 	VPERM V4, V5, KI, V4
 	VPERM V0, V2, V8, V0
 	VPERM V4, V6, V8, V4
 	VPERM V0, V3, V9, V0
 	VPERM V4, V7, V9, V4
-	STXVD2X VS32, (CTX+HEX00) // v0 = vs32
-	STXVD2X VS36, (CTX+HEX10) // v4 = vs36
+	STXVD2X V0, (CTX+R_x000)
+	STXVD2X V4, (CTX+R_x010)
 
 end:
 	RET
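For reference, figures like the table in the commit message come from the
Hash8Bytes/Hash1K/Hash8K benchmarks in the crypto/sha256 package tests. A
minimal standalone benchmark in the same spirit (illustrative package and
helper names, not part of this change):

	package sha256bench

	import (
		"crypto/sha256"
		"testing"
	)

	func benchmarkSize(b *testing.B, size int) {
		buf := make([]byte, size)
		sum := make([]byte, 0, sha256.Size)
		h := sha256.New()
		b.SetBytes(int64(size)) // lets -bench report MB/s, as in the table
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			h.Reset()
			h.Write(buf)
			sum = h.Sum(sum[:0])
		}
	}

	func BenchmarkHash8Bytes(b *testing.B) { benchmarkSize(b, 8) }
	func BenchmarkHash1K(b *testing.B)    { benchmarkSize(b, 1024) }
	func BenchmarkHash8K(b *testing.B)    { benchmarkSize(b, 8192) }

Collecting go test -bench output before and after the change (e.g. with
-count 10) and comparing the two files with benchstat
(golang.org/x/perf/cmd/benchstat) produces old/new/delta tables like the
one above.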