From: Paul E. Murphy Date: Mon, 9 May 2022 15:44:07 +0000 (-0500) Subject: crypto/aes: enable aes-cbc asm for ppc64 X-Git-Tag: go1.19beta1~272 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=b553f51fcd959c9d2d113ce74eba6eac958e2a5b;p=gostls13.git crypto/aes: enable aes-cbc asm for ppc64 This reworks how we load/store vector registers using the new bi-endian P9 instruction emulation macros. This also removes quite a bit of asm used to align and reorder vector registers. This is also a slight improvement on P9 ppc64le/linux: name old speed new speed delta AESCBCEncrypt1K 936MB/s ± 0% 943MB/s ± 0% +0.80% AESCBCDecrypt1K 1.28GB/s ± 0% 1.37GB/s ± 0% +6.76% Updates #18499 Change-Id: Ic5ff71d217d7302b6ae4e8d877c25004bfda5ecd Reviewed-on: https://go-review.googlesource.com/c/go/+/405134 Run-TryBot: Paul Murphy TryBot-Result: Gopher Robot Reviewed-by: Dmitri Shuralyov Reviewed-by: Lynn Boger Reviewed-by: Ian Lance Taylor --- diff --git a/src/crypto/aes/asm_ppc64x.s b/src/crypto/aes/asm_ppc64x.s index 3513f09a93..1ae69d2a2d 100644 --- a/src/crypto/aes/asm_ppc64x.s +++ b/src/crypto/aes/asm_ppc64x.s @@ -513,10 +513,6 @@ Linvalid_key_len: #undef ROUNDS #undef KEY #undef TMP -#undef OUTPERM -#undef OUTMASK -#undef OUTHEAD -#undef OUTTAIL // CBC encrypt or decrypt // R3 src @@ -529,13 +525,9 @@ Linvalid_key_len: // Register usage: // R9: ROUNDS // R10: Index -// V0: initialized to 0 -// V3: initialized to mask // V4: IV // V5: SRC -// V6: IV perm mask // V7: DST -// V10: KEY perm mask #define INP R3 #define OUT R4 @@ -547,17 +539,10 @@ Linvalid_key_len: #define IDX R10 #define RNDKEY0 V0 -#define RNDKEY1 V1 #define INOUT V2 #define TMP V3 #define IVEC V4 -#define INPTAIL V5 -#define INPPERM V6 -#define OUTHEAD V7 -#define OUTPERM V8 -#define OUTMASK V9 -#define KEYPERM V10 // Vector loads are done using LVX followed by // a VPERM using mask generated from previous @@ -588,32 +573,19 @@ TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0 MOVD enc+40(FP), ENC MOVD nr+48(FP), ROUNDS - CMPU LEN, $16 // cmpldi r5,16 - BC 14, 0, LR // bltlr- - CMPW ENC, $0 // cmpwi r8,0 - MOVD $15, IDX // li r10,15 - VXOR RNDKEY0, RNDKEY0, RNDKEY0 // vxor v0,v0,v0 - VSPLTISB $0xf, TMP // vspltisb $0xf,v3 - - LVX (IVP)(R0), IVEC // lvx v4,r0,r7 - LVSL (IVP)(R0), INPPERM // lvsl v6,r0,r7 - LVX (IVP)(IDX), INPTAIL // lvx v5,r10,r7 - VXOR INPPERM, TMP, INPPERM // vxor v3, v6, v6 - VPERM IVEC, INPTAIL, INPPERM, IVEC // vperm v4,v4,v5,v6 - NEG INP, R11 // neg r11,r3 - LVSR (KEY)(R0), KEYPERM // lvsr v10,r0,r6 - LVSR (R11)(R0), V6 // lvsr v6,r0,r11 - LVX (INP)(R0), INPTAIL // lvx v5,r0,r3 - ADD $15, INP // addi r3,r3,15 - VXOR INPPERM, TMP, INPPERM // vxor v6, v3, v6 - LVSL (OUT)(R0), OUTPERM // lvsl v8,r0,r4 - VSPLTISB $-1, OUTMASK // vspltisb v9,-1 - LVX (OUT)(R0), OUTHEAD // lvx v7,r0,r4 - VPERM OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8 - VXOR OUTPERM, TMP, OUTPERM // vxor v8, v3, v8 - SRW $1, ROUNDS // rlwinm r9,r9,31,1,31 - - MOVD $16, IDX // li r10,16 +#ifdef GOARCH_ppc64le + MOVD $·rcon(SB), R11 + LVX (R11), ESPERM // Permute value for P8_ macros. +#endif + + CMPU LEN, $16 // cmpldi r5,16 + BC 14, 0, LR // bltlr-, return if len < 16. + CMPW ENC, $0 // cmpwi r8,0 + + P8_LXVB16X(IVP, R0, IVEC) // load ivec in BE register order + + SRW $1, ROUNDS // rlwinm r9,r9,31,1,31 + MOVD $0, IDX // li r10,0 ADD $-1, ROUNDS // addi r9,r9,-1 BEQ Lcbc_dec // beq PCALIGN $16 @@ -621,45 +593,34 @@ TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0 // Outer loop: initialize encrypted value (INOUT) // Load input (INPTAIL) ivec (IVEC) Lcbc_enc: - VOR INPTAIL, INPTAIL, INOUT // vor v2,v5,v5 - LVX (INP)(R0), INPTAIL // lvx v5,r0,r3 + P8_LXVB16X(INP, R0, INOUT) // load text in BE vreg order ADD $16, INP // addi r3,r3,16 MOVD ROUNDS, CTR // mtctr r9 ADD $-16, LEN // addi r5,r5,-16 - LVX (KEY)(R0), RNDKEY0 // lvx v0,r0,r6 - VPERM INOUT, INPTAIL, INPPERM, INOUT // vperm v2,v2,v5,v6 - LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + P8_LXV(KEY, IDX, RNDKEY0) // load first xkey ADD $16, IDX // addi r10,r10,16 - VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10 VXOR INOUT, RNDKEY0, INOUT // vxor v2,v2,v0 - LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6 - ADD $16, IDX // addi r10,r10,16 VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4 - // Encryption loop of INOUT using RNDKEY0 and RNDKEY1 + // Encryption loop of INOUT using RNDKEY0 Loop_cbc_enc: - VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10 - VCIPHER INOUT, RNDKEY1, INOUT // vcipher v2,v2,v1 - LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + P8_LXV(KEY, IDX, RNDKEY0) // load next xkey + VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v1 ADD $16, IDX // addi r10,r10,16 - VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10 - VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v0 - LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6 + P8_LXV(KEY, IDX, RNDKEY0) // load next xkey + VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v1 ADD $16, IDX // addi r10,r10,16 - BC 16, 0, Loop_cbc_enc // bdnz Loop_cbc_enc + BDNZ Loop_cbc_enc // Encrypt tail values and store INOUT - VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10 - VCIPHER INOUT, RNDKEY1, INOUT // vcipher v2,v2,v1 - LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 - MOVD $16, IDX // li r10,16 - VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10 + P8_LXV(KEY, IDX, RNDKEY0) // load next xkey + VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v1 + ADD $16, IDX // addi r10,r10,16 + P8_LXV(KEY, IDX, RNDKEY0) // load final xkey VCIPHERLAST INOUT, RNDKEY0, IVEC // vcipherlast v4,v2,v0 + MOVD $0, IDX // reset key index for next block CMPU LEN, $16 // cmpldi r5,16 - VPERM IVEC, IVEC, OUTPERM, TMP // vperm v3,v4,v4,v8 - VSEL OUTHEAD, TMP, OUTMASK, INOUT // vsel v2,v7,v3,v9 - VOR TMP, TMP, OUTHEAD // vor v7,v3,v3 - STVX INOUT, (OUT)(R0) // stvx v2,r0,r4 + P8_STXVB16X(IVEC, OUT, R0) // store ciphertext in BE order ADD $16, OUT // addi r4,r4,16 BGE Lcbc_enc // bge Lcbc_enc BR Lcbc_done // b Lcbc_done @@ -667,69 +628,41 @@ Loop_cbc_enc: // Outer loop: initialize decrypted value (INOUT) // Load input (INPTAIL) ivec (IVEC) Lcbc_dec: - VOR INPTAIL, INPTAIL, TMP // vor v3,v5,v5 - LVX (INP)(R0), INPTAIL // lvx v5,r0,r3 + P8_LXVB16X(INP, R0, TMP) // load ciphertext in BE vreg order ADD $16, INP // addi r3,r3,16 MOVD ROUNDS, CTR // mtctr r9 ADD $-16, LEN // addi r5,r5,-16 - LVX (KEY)(R0), RNDKEY0 // lvx v0,r0,r6 - VPERM TMP, INPTAIL, INPPERM, TMP // vperm v3,v3,v5,v6 - LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + P8_LXV(KEY, IDX, RNDKEY0) // load first xkey ADD $16, IDX // addi r10,r10,16 - VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10 VXOR TMP, RNDKEY0, INOUT // vxor v2,v3,v0 - LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6 - ADD $16, IDX // addi r10,r10,16 PCALIGN $16 - // Decryption loop of INOUT using RNDKEY0 and RNDKEY1 + // Decryption loop of INOUT using RNDKEY0 Loop_cbc_dec: - VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10 - VNCIPHER INOUT, RNDKEY1, INOUT // vncipher v2,v2,v1 - LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + P8_LXV(KEY, IDX, RNDKEY0) // load next xkey ADD $16, IDX // addi r10,r10,16 - VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10 - VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v0 - LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6 + VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v1 + P8_LXV(KEY, IDX, RNDKEY0) // load next xkey ADD $16, IDX // addi r10,r10,16 - BC 16, 0, Loop_cbc_dec // bdnz + VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v0 + BDNZ Loop_cbc_dec // Decrypt tail values and store INOUT - VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10 - VNCIPHER INOUT, RNDKEY1, INOUT // vncipher v2,v2,v1 - LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 - MOVD $16, IDX // li r10,16 - VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10 + P8_LXV(KEY, IDX, RNDKEY0) // load next xkey + ADD $16, IDX // addi r10,r10,16 + VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v1 + P8_LXV(KEY, IDX, RNDKEY0) // load final xkey + MOVD $0, IDX // li r10,0 VNCIPHERLAST INOUT, RNDKEY0, INOUT // vncipherlast v2,v2,v0 CMPU LEN, $16 // cmpldi r5,16 VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4 VOR TMP, TMP, IVEC // vor v4,v3,v3 - VPERM INOUT, INOUT, OUTPERM, TMP // vperm v3,v2,v2,v8 - VSEL OUTHEAD, TMP, OUTMASK, INOUT // vsel v2,v7,v3,v9 - VOR TMP, TMP, OUTHEAD // vor v7,v3,v3 - STVX INOUT, (OUT)(R0) // stvx v2,r0,r4 + P8_STXVB16X(INOUT, OUT, R0) // store text in BE order ADD $16, OUT // addi r4,r4,16 BGE Lcbc_dec // bge Lcbc_done: - ADD $-1, OUT // addi r4,r4,-1 - LVX (OUT)(R0), INOUT // lvx v2,r0,r4 - VSEL OUTHEAD, INOUT, OUTMASK, INOUT // vsel v2,v7,v2,v9 - STVX INOUT, (OUT)(R0) // stvx v2,r0,r4 - NEG IVP, ENC // neg r8,r7 - MOVD $15, IDX // li r10,15 - VXOR RNDKEY0, RNDKEY0, RNDKEY0 // vxor v0,v0,v0 - VSPLTISB $-1, OUTMASK // vspltisb v9,-1 - VSPLTISB $0xf, TMP // vspltisb v3, 0xf - LVSR (ENC)(R0), OUTPERM // lvsl v8,r0,r8 - VPERM OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8 - VXOR OUTPERM, TMP, OUTPERM // vxor v9, v3, v9 - LVX (IVP)(R0), OUTHEAD // lvx v7,r0,r7 - VPERM IVEC, IVEC, OUTPERM, IVEC // vperm v4,v4,v4,v8 - VSEL OUTHEAD, IVEC, OUTMASK, INOUT // vsel v2,v7,v4,v9 - LVX (IVP)(IDX), INPTAIL // lvx v5,r10,r7 - STVX INOUT, (IVP)(R0) // stvx v2,r0,r7 - VSEL IVEC, INPTAIL, OUTMASK, INOUT // vsel v2,v4,v5,v9 - STVX INOUT, (IVP)(IDX) // stvx v2,r10,r7 + VXOR RNDKEY0, RNDKEY0, RNDKEY0 // clear key register + P8_STXVB16X(IVEC, R0, IVP) // Save ivec in BE order for next round. RET // bclr 20,lt,0 diff --git a/src/crypto/aes/cbc_ppc64le.go b/src/crypto/aes/cbc_ppc64x.go similarity index 98% rename from src/crypto/aes/cbc_ppc64le.go rename to src/crypto/aes/cbc_ppc64x.go index cb9ff4c843..4c9257cdd2 100644 --- a/src/crypto/aes/cbc_ppc64le.go +++ b/src/crypto/aes/cbc_ppc64x.go @@ -2,6 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build ppc64 || ppc64le + package aes import (