//go:build ppc64 || ppc64le
-// Based on CRYPTOGAMS code with the following comment:
+// Portions based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================
-// This implementation is based on the ppc64 asm generated by the
-// script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
+// The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
+// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
// from commit d47afb3c.
// Changes were made due to differences in the ABI and some register usage.
// Some arguments were changed due to the way the Go code passes them.
+// Portions that use the stitched AES-GCM approach in counterCryptASM
+// are based on code found in
+// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s
+
#include "textflag.h"
#define XIP R3
#define VIN0 VIN
+#define ESPERM V10
+#define TMP2 V11
+
+// The following macros provide appropriate
+// implementations for either endianness, as
+// well as ISA-specific versions for power8
+// and power9.
+#ifdef GOARCH_ppc64le
+# ifdef GOPPC64_power9
+#define P8_LXVB16X(RA,RB,VT) LXVB16X (RA)(RB), VT
+#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA)(RB)
+# else
+#define NEEDS_ESPERM
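+// NEEDS_ESPERM flags that the byte-swapping
+// permute vector must be loaded into ESPERM
+// (see the LVX of ·rcon in counterCryptASM)
+// before the P8_ macros below are used.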
+#define P8_LXVB16X(RA,RB,VT) \
+ LXVD2X (RA+RB), VT \
+ VPERM VT, VT, ESPERM, VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+ VPERM VS, VS, ESPERM, TMP2; \
+ STXVD2X TMP2, (RA+RB)
+
+# endif
+#else
+#define P8_LXVB16X(RA,RB,VT) \
+ LXVD2X (RA+RB), VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+ STXVD2X VS, (RA+RB)
+
+#endif
+
+#define MASK_PTR R8
+
+#define MASKV V0
+#define INV V1
+
+// The following macros are used for
+// the stitched implementation within
+// counterCryptASM.
+
+// Load the initial GCM counter value
+// in V30 and set up the counter increment
+// in V31
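+// VSPLTISB fills every byte of V28 with 0x01,
+// and VSLDOI slides one of those bytes into the
+// low end of the zeroed V31, giving V31 =
+// {0, 0, 0, 1} as 32-bit words. VADDUWM then
+// bumps only the rightmost word of V30, which
+// matches GCM's 32-bit counter increment (inc32);
+// the carry does not propagate past that word.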
+#define SETUP_COUNTER \
+ P8_LXVB16X(COUNTER, R0, V30); \
+ VSPLTISB $1, V28; \
+ VXOR V31, V31, V31; \
+ VSLDOI $1, V31, V28, V31
+
+// These macros set up the initial value
+// for a single encryption, or for 4 or 8
+// stitched encryptions implemented with
+// interleaved vciphers.
+//
+// The input value for each encryption
+// is generated by XORing the counter
+// from V30 with the first key in VS0
+// and incrementing the counter.
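+// (The XOR with round-key 0 in VS0 is AES's
+// initial AddRoundKey step; each VCIPHER that
+// follows performs one full middle round.)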
+//
+// Single encryption in V15
+#define GEN_VCIPHER_INPUT \
+ XXLOR VS0, VS0, V29 \
+ VXOR V30, V29, V15; \
+ VADDUWM V30, V31, V30
+
+// 4 encryptions in V15 - V18
+#define GEN_VCIPHER_4_INPUTS \
+ XXLOR VS0, VS0, V29; \
+ VXOR V30, V29, V15; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V16; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V17; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V18; \
+ VADDUWM V30, V31, V30
+
+// 8 encryptions in V15 - V22
+#define GEN_VCIPHER_8_INPUTS \
+ XXLOR VS0, VS0, V29; \
+ VXOR V30, V29, V15; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V16; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V17; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V18; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V19; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V20; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V21; \
+ VADDUWM V30, V31, V30; \
+ VXOR V30, V29, V22; \
+ VADDUWM V30, V31, V30
+
+// Load the keys to be used for
+// encryption based on key_len.
+// Keys are in VS0 - VS14
+// depending on key_len.
+// Valid key sizes are verified
+// here. CR2 is set and used
+// throughout to check key_len.
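+// Note that key_len is the AES round count:
+// 10, 12 or 14 rounds use 11, 13 or 15 round
+// keys (VS0-VS10, VS0-VS12 or VS0-VS14).
+// Any other value reaches MOVD R0, 0(R0),
+// a deliberate store to address 0 that faults.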
+#define LOAD_KEYS(blk_key, key_len) \
+ MOVD $16, R16; \
+ MOVD $32, R17; \
+ MOVD $48, R18; \
+ MOVD $64, R19; \
+ LXVD2X (blk_key)(R0), VS0; \
+ LXVD2X (blk_key)(R16), VS1; \
+ LXVD2X (blk_key)(R17), VS2; \
+ LXVD2X (blk_key)(R18), VS3; \
+ LXVD2X (blk_key)(R19), VS4; \
+ ADD $64, R16; \
+ ADD $64, R17; \
+ ADD $64, R18; \
+ ADD $64, R19; \
+ LXVD2X (blk_key)(R16), VS5; \
+ LXVD2X (blk_key)(R17), VS6; \
+ LXVD2X (blk_key)(R18), VS7; \
+ LXVD2X (blk_key)(R19), VS8; \
+ ADD $64, R16; \
+ ADD $64, R17; \
+ ADD $64, R18; \
+ ADD $64, R19; \
+ LXVD2X (blk_key)(R16), VS9; \
+ LXVD2X (blk_key)(R17), VS10; \
+ CMP key_len, $12, CR2; \
+ CMP key_len, $10; \
+ BEQ keysLoaded; \
+ LXVD2X (blk_key)(R18), VS11; \
+ LXVD2X (blk_key)(R19), VS12; \
+ BEQ CR2, keysLoaded; \
+ ADD $64, R16; \
+ ADD $64, R17; \
+ LXVD2X (blk_key)(R16), VS13; \
+ LXVD2X (blk_key)(R17), VS14; \
+ CMP key_len, $14; \
+ BEQ keysLoaded; \
+ MOVD R0,0(R0); \
+keysLoaded:
+
+// Encrypt 1 value (vin) with the
+// first 9 keys from VS1 - VS9.
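+// VCIPHER operates only on vector registers,
+// so each round key cached in a VSX register
+// is first copied into a scratch VR (V23-V27)
+// with XXLOR, staged in groups ahead of the
+// dependent vcipher chain.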
+#define VCIPHER_1X9_KEYS(vin) \
+ XXLOR VS1, VS1, V23; \
+ XXLOR VS2, VS2, V24; \
+ XXLOR VS3, VS3, V25; \
+ XXLOR VS4, VS4, V26; \
+ XXLOR VS5, VS5, V27; \
+ VCIPHER vin, V23, vin; \
+ VCIPHER vin, V24, vin; \
+ VCIPHER vin, V25, vin; \
+ VCIPHER vin, V26, vin; \
+ VCIPHER vin, V27, vin; \
+ XXLOR VS6, VS6, V23; \
+ XXLOR VS7, VS7, V24; \
+ XXLOR VS8, VS8, V25; \
+ XXLOR VS9, VS9, V26; \
+ VCIPHER vin, V23, vin; \
+ VCIPHER vin, V24, vin; \
+ VCIPHER vin, V25, vin; \
+ VCIPHER vin, V26, vin
+
+// Encrypt 1 value (vin) with
+// 2 specified keys
+#define VCIPHER_1X2_KEYS(vin, key1, key2) \
+ XXLOR key1, key1, V25; \
+ XXLOR key2, key2, V26; \
+ VCIPHER vin, V25, vin; \
+ VCIPHER vin, V26, vin
+
+// Encrypt 4 values in V15 - V18
+// with the specified key,
+// assuming it is a VSreg.
+#define VCIPHER_4X1_KEY(key) \
+ XXLOR key, key, V23; \
+ VCIPHER V15, V23, V15; \
+ VCIPHER V16, V23, V16; \
+ VCIPHER V17, V23, V17; \
+ VCIPHER V18, V23, V18
+
+// Encrypt 8 values in V15 - V22
+// with the specified key,
+// assuming it is a VSreg
+#define VCIPHER_8X1_KEY(key) \
+ XXLOR key, key, V23; \
+ VCIPHER V15, V23, V15; \
+ VCIPHER V16, V23, V16; \
+ VCIPHER V17, V23, V17; \
+ VCIPHER V18, V23, V18; \
+ VCIPHER V19, V23, V19; \
+ VCIPHER V20, V23, V20; \
+ VCIPHER V21, V23, V21; \
+ VCIPHER V22, V23, V22
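+
+// Within one stream each VCIPHER depends on the
+// previous round's result, but the 4 or 8 streams
+// are independent, so interleaving them keeps the
+// vector pipeline busy and hides the instruction
+// latency.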
+
+// Load input block into V1-V4
+// in big endian order and
+// update blk_inp by 64.
+#define LOAD_INPUT_BLOCK64(blk_inp) \
+ MOVD $16, R16; \
+ MOVD $32, R17; \
+ MOVD $48, R18; \
+ P8_LXVB16X(blk_inp,R0,V1); \
+ P8_LXVB16X(blk_inp,R16,V2); \
+ P8_LXVB16X(blk_inp,R17,V3); \
+ P8_LXVB16X(blk_inp,R18,V4); \
+ ADD $64, blk_inp
+
+// Load input block into V1-V8
+// in big endian order and
+// update blk_inp by 128.
+#define LOAD_INPUT_BLOCK128(blk_inp) \
+ MOVD $16, R16; \
+ MOVD $32, R17; \
+ MOVD $48, R18; \
+ MOVD $64, R19; \
+ MOVD $80, R20; \
+ MOVD $96, R21; \
+ MOVD $112, R22; \
+ P8_LXVB16X(blk_inp,R0,V1); \
+ P8_LXVB16X(blk_inp,R16,V2); \
+ P8_LXVB16X(blk_inp,R17,V3); \
+ P8_LXVB16X(blk_inp,R18,V4); \
+ P8_LXVB16X(blk_inp,R19,V5); \
+ P8_LXVB16X(blk_inp,R20,V6); \
+ P8_LXVB16X(blk_inp,R21,V7); \
+ P8_LXVB16X(blk_inp,R22,V8); \
+ ADD $128, blk_inp
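+
+// Note: R16-R22 (R16-R18 in the 64-byte case) keep
+// their 16-byte offsets and are reused below by the
+// matching STORE_OUTPUT_BLOCK macros.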
+
+// Finish encryption on 8 streams and
+// XOR with input block
+#define VCIPHERLAST8_XOR_INPUT \
+ VCIPHERLAST V15, V23, V15; \
+ VCIPHERLAST V16, V23, V16; \
+ VCIPHERLAST V17, V23, V17; \
+ VCIPHERLAST V18, V23, V18; \
+ VCIPHERLAST V19, V23, V19; \
+ VCIPHERLAST V20, V23, V20; \
+ VCIPHERLAST V21, V23, V21; \
+ VCIPHERLAST V22, V23, V22; \
+ XXLXOR V1, V15, V1; \
+ XXLXOR V2, V16, V2; \
+ XXLXOR V3, V17, V3; \
+ XXLXOR V4, V18, V4; \
+ XXLXOR V5, V19, V5; \
+ XXLXOR V6, V20, V6; \
+ XXLXOR V7, V21, V7; \
+ XXLXOR V8, V22, V8
+
+// Finish encryption on 4 streams and
+// XOR with input block
+#define VCIPHERLAST4_XOR_INPUT \
+ VCIPHERLAST V15, V23, V15; \
+ VCIPHERLAST V16, V23, V16; \
+ VCIPHERLAST V17, V23, V17; \
+ VCIPHERLAST V18, V23, V18; \
+ XXLXOR V1, V15, V1; \
+ XXLXOR V2, V16, V2; \
+ XXLXOR V3, V17, V3; \
+ XXLXOR V4, V18, V4
+
+// Store output block from V1-V8
+// in big endian order and
+// update blk_out by 128.
+#define STORE_OUTPUT_BLOCK128(blk_out) \
+ P8_STXVB16X(V1,blk_out,R0); \
+ P8_STXVB16X(V2,blk_out,R16); \
+ P8_STXVB16X(V3,blk_out,R17); \
+ P8_STXVB16X(V4,blk_out,R18); \
+ P8_STXVB16X(V5,blk_out,R19); \
+ P8_STXVB16X(V6,blk_out,R20); \
+ P8_STXVB16X(V7,blk_out,R21); \
+ P8_STXVB16X(V8,blk_out,R22); \
+ ADD $128, blk_out
+
+// Store output block from V1-V4
+// in big endian order and
+// update blk_out by 64.
+#define STORE_OUTPUT_BLOCK64(blk_out) \
+ P8_STXVB16X(V1,blk_out,R0); \
+ P8_STXVB16X(V2,blk_out,R16); \
+ P8_STXVB16X(V3,blk_out,R17); \
+ P8_STXVB16X(V4,blk_out,R18); \
+ ADD $64, blk_out
+
// func gcmInit(productTable *[256]byte, h []byte)
TEXT ·gcmInit(SB), NOSPLIT, $0-32
MOVD productTable+0(FP), XIP
#endif
STXVD2X VXL, (XIP+R0) // write out Xi
RET
+
+#define BLK_INP R3
+#define BLK_OUT R4
+#define BLK_KEY R5
+#define KEY_LEN R6
+#define BLK_IDX R7
+#define IDX R8
+#define IN_LEN R9
+#define COUNTER R10
+#define CONPTR R14
+#define MASK V5
+
+// Implementation of the counterCrypt function in assembler.
+// The original loop is unrolled so that multiple encryption
+// streams can run in parallel, which is achieved by interleaving
+// the vcipher instructions from each stream. This is also referred
+// to as stitching, and it provides a significant performance
+// improvement. Some macros are defined which enable execution
+// for big or little endian as well as different ISA targets.
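+//
+// As a rough sketch (illustrative only), each 16-byte block
+// is computed as
+//	out[i] = in[i] XOR AES_encrypt(key, counter); counter = inc32(counter)
+// with the loop unrolled 8x or 4x where the input allows, and
+// a byte-wise tail for a final partial block.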
+// Called from (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) as:
+//func counterCryptASM(nr int, out, in []byte, counter *[gcmBlockSize]byte, key *uint32)
+TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
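+ // The 16-byte local area (the $16 in $16-72) is scratch
+ // space for the pre-POWER10 partial-block path below;
+ // the 72 bytes describe the argument area.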
+ MOVD nr+0(FP), KEY_LEN
+ MOVD out+8(FP), BLK_OUT
+ MOVD out_len+16(FP), R8
+ MOVD in+32(FP), BLK_INP
+ MOVD in_len+40(FP), IN_LEN
+ MOVD counter+56(FP), COUNTER
+ MOVD key+64(FP), BLK_KEY
+
+// Set up the permute vector when needed.
+#ifdef NEEDS_ESPERM
+ MOVD $·rcon(SB), R14
+ LVX (R14), ESPERM // Permute value for P8_ macros.
+#endif
+ SETUP_COUNTER // V30 Counter V31 BE {0, 0, 0, 1}
+ LOAD_KEYS(BLK_KEY, KEY_LEN) // VS0 - VS10/12/14 based on key size
+ CMP IN_LEN, $128
+ BLT block64
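+ // The bulk of the input is consumed 128 bytes at a time
+ // by the 8-way stitched loop; any remainder falls through
+ // to the 4-way (64-byte), single-block (16-byte) and
+ // partial-block paths below.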
+block128_loop:
+ // Do 8 encryptions in parallel by setting
+ // input values in V15-V22 and executing
+ // vcipher on the updated value and the keys.
+ GEN_VCIPHER_8_INPUTS
+ VCIPHER_8X1_KEY(VS1)
+ VCIPHER_8X1_KEY(VS2)
+ VCIPHER_8X1_KEY(VS3)
+ VCIPHER_8X1_KEY(VS4)
+ VCIPHER_8X1_KEY(VS5)
+ VCIPHER_8X1_KEY(VS6)
+ VCIPHER_8X1_KEY(VS7)
+ VCIPHER_8X1_KEY(VS8)
+ VCIPHER_8X1_KEY(VS9)
+ // Additional encryptions are done based on
+ // the key length, with the last key moved
+ // to V23 for use with VCIPHERLAST.
+ // CR2 = CMP key_len, $12
+ XXLOR VS10, VS10, V23
+ BLT CR2, block128_last // key_len = 10
+ VCIPHER_8X1_KEY(VS10)
+ VCIPHER_8X1_KEY(VS11)
+ XXLOR VS12,VS12,V23
+ BEQ CR2, block128_last // key_len = 12
+ VCIPHER_8X1_KEY(VS12)
+ VCIPHER_8X1_KEY(VS13)
+ XXLOR VS14,VS14,V23 // key_len = 14
+block128_last:
+ // vcipher encryptions are in V15-V22 at this
+ // point with vcipherlast remaining to be done.
+ // Load input block into V1-V8, setting index offsets
+ // in R16-R22 to use with the STORE.
+ LOAD_INPUT_BLOCK128(BLK_INP)
+ // Do VCIPHERLAST on the last key for each encryption
+ // stream and XOR the result with the corresponding
+ // value from the input block.
+ VCIPHERLAST8_XOR_INPUT
+ // Store the results (8*16) and update BLK_OUT by 128.
+ STORE_OUTPUT_BLOCK128(BLK_OUT)
+ ADD $-128, IN_LEN // input size
+ CMP IN_LEN, $128 // check if >= blocksize
+ BGE block128_loop // next input block
+ CMP IN_LEN, $0
+ BEQ done
+block64:
+ CMP IN_LEN, $64 // Check if >= 64
+ BLT block16_loop
+ // Do 4 encryptions in parallel by setting
+ // input values in V15-V18 and executing
+ // vcipher on the updated value and the keys.
+ GEN_VCIPHER_4_INPUTS
+ VCIPHER_4X1_KEY(VS1)
+ VCIPHER_4X1_KEY(VS2)
+ VCIPHER_4X1_KEY(VS3)
+ VCIPHER_4X1_KEY(VS4)
+ VCIPHER_4X1_KEY(VS5)
+ VCIPHER_4X1_KEY(VS6)
+ VCIPHER_4X1_KEY(VS7)
+ VCIPHER_4X1_KEY(VS8)
+ VCIPHER_4X1_KEY(VS9)
+ // Check key length based on CR2
+ // Move last key to V23 for use with later vcipherlast
+ XXLOR VS10, VS10, V23
+ BLT CR2, block64_last // size = 10
+ VCIPHER_4X1_KEY(VS10) // Encrypt next 2 keys
+ VCIPHER_4X1_KEY(VS11)
+ XXLOR VS12, VS12, V23
+ BEQ CR2, block64_last // size = 12
+ VCIPHER_4X1_KEY(VS12) // Encrypt last 2 keys
+ VCIPHER_4X1_KEY(VS13)
+ XXLOR VS14, VS14, V23 // size = 14
+block64_last:
+ LOAD_INPUT_BLOCK64(BLK_INP) // Load 64 bytes of input
+ // Do VCIPHERLAST on the last key for each encryption
+ // stream and XOR the result with the corresponding
+ // value from the input block.
+ VCIPHERLAST4_XOR_INPUT
+ // Store the results (4*16) and update BLK_OUT by 64.
+ STORE_OUTPUT_BLOCK64(BLK_OUT)
+ ADD $-64, IN_LEN // decrement input block length
+ CMP IN_LEN, $0 // check for remaining length
+ BEQ done
+block16_loop:
+ CMP IN_LEN, $16 // More input
+ BLT final_block // If not, then handle partial block
+ // Single encryption, no stitching
+ GEN_VCIPHER_INPUT // Generate input value for single encryption
+ VCIPHER_1X9_KEYS(V15) // Encrypt V15 value with 9 keys
+ XXLOR VS10, VS10, V23 // Last key -> V23 for later vcipherlast
+ // Key length based on CR2. (LT=10, EQ=12, GT=14)
+ BLT CR2, block16_last // Finish for key size 10
+ VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
+ XXLOR VS12, VS12, V23 // Last key -> V23 for later vcipherlast
+ BEQ CR2, block16_last // Finish for key size 12
+ VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
+ XXLOR VS14, VS14, V23 // Last key -> V23 for vcipherlast with key size 14
+block16_last:
+ P8_LXVB16X(BLK_INP, R0, V1) // Load input
+ VCIPHERLAST V15, V23, V15 // Final round using the last key in V23
+ XXLXOR V15, V1, V1 // XOR with input
+ P8_STXVB16X(V1,R0,BLK_OUT) // Store final encryption value to output
+ ADD $16, BLK_INP // Increment input pointer
+ ADD $16, BLK_OUT // Increment output pointer
+ ADD $-16, IN_LEN // Decrement input length
+ BR block16_loop // Check for next
+final_block:
+ CMP IN_LEN, $0
+ BEQ done
+ GEN_VCIPHER_INPUT // Generate input value for partial encryption
+ VCIPHER_1X9_KEYS(V15) // Encrypt V15 with 9 keys
+ XXLOR VS10, VS10, V23 // Save possible last key
+ BLT CR2, final_block_last
+ VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with next 2 keys
+ XXLOR VS12, VS12, V23 // Save possible last key
+ BEQ CR2, final_block_last
+ VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
+ XXLOR VS14, VS14, V23 // Save last key
+final_block_last:
+ VCIPHERLAST V15, V23, V15 // Finish encryption
+#ifdef GOPPC64_power10
+ // Shift the length into bits 0:7 of R17, as LXVLL/STXVLL require
+ SLD $56, IN_LEN, R17
+ LXVLL BLK_INP, R17, V25
+ VXOR V25, V15, V25
+ STXVLL V25, BLK_OUT, R17
+#else
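+ // Pre-POWER10 fallback: store the encrypted counter block
+ // to a 16-byte scratch slot on the stack, then XOR it with
+ // the remaining 1-15 input bytes in 8-, 4-, 2- and 1-byte
+ // steps, advancing the shared offset in R16.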
+ ADD $32, R1, MASK_PTR
+ MOVD $0, R16
+ P8_STXVB16X(V15, MASK_PTR, R0)
+ CMP IN_LEN, $8
+ BLT next4
+ MOVD 0(MASK_PTR), R14
+ MOVD 0(BLK_INP), R15
+ XOR R14, R15, R14
+ MOVD R14, 0(BLK_OUT)
+ ADD $8, R16
+ ADD $-8, IN_LEN
+next4:
+ CMP IN_LEN, $4
+ BLT next2
+ MOVWZ (BLK_INP)(R16), R15
+ MOVWZ (MASK_PTR)(R16), R14
+ XOR R14, R15, R14
+ MOVW R14, (R16)(BLK_OUT)
+ ADD $4, R16
+ ADD $-4, IN_LEN
+next2:
+ CMP IN_LEN, $2
+ BLT next1
+ MOVHZ (BLK_INP)(R16), R15
+ MOVHZ (MASK_PTR)(R16), R14
+ XOR R14, R15, R14
+ MOVH R14, (R16)(BLK_OUT)
+ ADD $2, R16
+ ADD $-2, IN_LEN
+next1:
+ CMP IN_LEN, $1
+ BLT done
+ MOVBZ (MASK_PTR)(R16), R14
+ MOVBZ (BLK_INP)(R16), R15
+ XOR R14, R15, R14
+ MOVB R14, (R16)(BLK_OUT)
+#endif
+done:
+ // Save the updated counter value
+ P8_STXVB16X(V30, COUNTER, R0)
+ // Clear the expanded keys so no key material is left in registers
+ XXLXOR VS0, VS0, VS0
+ XXLXOR VS1, VS1, VS1
+ XXLXOR VS2, VS2, VS2
+ XXLXOR VS3, VS3, VS3
+ XXLXOR VS4, VS4, VS4
+ XXLXOR VS5, VS5, VS5
+ XXLXOR VS6, VS6, VS6
+ XXLXOR VS7, VS7, VS7
+ XXLXOR VS8, VS8, VS8
+ XXLXOR VS9, VS9, VS9
+ XXLXOR VS10, VS10, VS10
+ XXLXOR VS11, VS11, VS11
+ XXLXOR VS12, VS12, VS12
+ XXLXOR VS13, VS13, VS13
+ XXLXOR VS14, VS14, VS14
+ RET
+