crypto/aes,crypto/cipher: improve gcm performance on ppc64x

author Lynn Boger <laboger@linux.vnet.ibm.com>

Tue, 11 Apr 2023 19:49:29 +0000 (14:49 -0500)

committer Lynn Boger <laboger@linux.vnet.ibm.com>

Mon, 9 Oct 2023 18:53:44 +0000 (18:53 +0000)
author Lynn Boger <laboger@linux.vnet.ibm.com>
Tue, 11 Apr 2023 19:49:29 +0000 (14:49 -0500)
committer Lynn Boger <laboger@linux.vnet.ibm.com>
Mon, 9 Oct 2023 18:53:44 +0000 (18:53 +0000)
diff --git a/src/crypto/aes/gcm_ppc64x.go b/src/crypto/aes/gcm_ppc64x.go

index 44b27056d6bd072c0848b010d88c16873358239a..3dbf4ba5782ac9d021705d07fe68493a75231b1a 100644 (file)
--- a/src/crypto/aes/gcm_ppc64x.go
+++ b/src/crypto/aes/gcm_ppc64x.go
@@ -51,6 +51,8 @@ type gcmAsm struct {
         tagSize int
  }
  
+func counterCryptASM(nr int, out, in []byte, counter *[gcmBlockSize]byte, key *uint32) // implemented in gcm_ppc64x.s
+
  // NewGCM returns the AES cipher wrapped in Galois Counter Mode. This is only
  // called by crypto/cipher.NewGCM via the gcmAble interface.
  func (c *aesCipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
@@ -114,34 +116,10 @@ func (g *gcmAsm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
  // into out. counter is the initial count value and will be updated with the next
  // count value. The length of out must be greater than or equal to the length
  // of in.
+// The work is delegated to counterCryptASM, an assembly routine in which
+// the loop is unrolled so several counter blocks are encrypted in parallel.
  func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
-       var mask [gcmBlockSize]byte
-
-       for len(in) >= gcmBlockSize {
-               // Hint to avoid bounds check
-               _, _ = in[15], out[15]
-               g.cipher.Encrypt(mask[:], counter[:])
-               gcmInc32(counter)
-
-               // XOR 16 bytes each loop iteration in 8 byte chunks
-               in0 := binary.LittleEndian.Uint64(in[0:])
-               in1 := binary.LittleEndian.Uint64(in[8:])
-               m0 := binary.LittleEndian.Uint64(mask[:8])
-               m1 := binary.LittleEndian.Uint64(mask[8:])
-               binary.LittleEndian.PutUint64(out[:8], in0^m0)
-               binary.LittleEndian.PutUint64(out[8:], in1^m1)
-               out = out[16:]
-               in = in[16:]
-       }
-
-       if len(in) > 0 {
-               g.cipher.Encrypt(mask[:], counter[:])
-               gcmInc32(counter)
-               // XOR leftover bytes
-               for i, inb := range in {
-                       out[i] = inb ^ mask[i]
-               }
-       }
+       counterCryptASM(len(g.cipher.enc)/4-1, out, in, counter, &g.cipher.enc[0])
  }
  
  // increments the rightmost 32-bits of the count value by 1.
diff --git a/src/crypto/aes/gcm_ppc64x.s b/src/crypto/aes/gcm_ppc64x.s

index 72f0b8e01c06a4e8cca1c015ba473d5a4a044630..f661b2764279c9215d114fab6d4293e1c877161a 100644 (file)
--- a/src/crypto/aes/gcm_ppc64x.s
+++ b/src/crypto/aes/gcm_ppc64x.s
@@ -4,7 +4,7 @@
  
  //go:build ppc64 || ppc64le
  
-// Based on CRYPTOGAMS code with the following comment:
+// Portions based on CRYPTOGAMS code with the following comment:
  // # ====================================================================
  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  // # project. The module is, however, dual licensed under OpenSSL and
@@ -12,13 +12,17 @@
  // # details see http://www.openssl.org/~appro/cryptogams/.
  // # ====================================================================
  
-// This implementation is based on the ppc64 asm generated by the
-// script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
+// The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
+// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
  // from commit d47afb3c.
  
  // Changes were made due to differences in the ABI and some register usage.
  // Some arguments were changed due to the way the Go code passes them.
  
+// Portions that use the stitched AES-GCM approach in counterCryptASM
+// are based on code found in
+// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s
+
  #include "textflag.h"
  
  #define XIP    R3
@@ -87,6 +91,292 @@
  
  #define VIN0   VIN
  
+#define ESPERM V10
+#define TMP2 V11
+
+// P8_LXVB16X and P8_STXVB16X load/store 16 bytes for each endianness and ISA.
+// LE power9 uses LXVB16X/STXVB16X directly; LE power8 pairs LXVD2X/STXVD2X
+// with a VPERM by ESPERM (TMP2 is the store scratch); BE needs no permute.
+#ifdef GOARCH_ppc64le
+#  ifdef GOPPC64_power9
+#define P8_LXVB16X(RA,RB,VT)   LXVB16X (RA)(RB), VT
+#define P8_STXVB16X(VS,RA,RB)  STXVB16X VS, (RA)(RB)
+#  else
+#define NEEDS_ESPERM
+#define P8_LXVB16X(RA,RB,VT) \
+       LXVD2X  (RA+RB), VT; \
+       VPERM   VT, VT, ESPERM, VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+       VPERM   VS, VS, ESPERM, TMP2; \
+       STXVD2X TMP2, (RA+RB)
+
+#  endif
+#else
+#define P8_LXVB16X(RA,RB,VT) \
+       LXVD2X  (RA+RB), VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+       STXVD2X VS, (RA+RB)
+
+#endif
+
+#define MASK_PTR   R8
+
+#define MASKV   V0
+#define INV     V1
+
+// The following macros are used for
+// the stitched implementation within
+// counterCryptASM.
+
+// Load the initial GCM counter block from
+// COUNTER into V30 and build the increment
+// in V31 (BE {0, 0, 0, 1}); clobbers V28.
+#define SETUP_COUNTER \
+       P8_LXVB16X(COUNTER, R0, V30); \
+       VSPLTISB $1, V28; \
+       VXOR V31, V31, V31; \
+       VSLDOI $1, V31, V28, V31
+
+// These macros produce the initial state
+// for a single encryption, or for 4 or 8
+// stitched encryptions whose vcipher
+// instructions are interleaved.
+//
+// Each state is the counter in V30 XORed
+// with the first key (VS0, copied to
+// V29); V30 is then advanced by V31
+// with a word-wise add (VADDUWM).
+//
+// Single encryption in V15
+#define GEN_VCIPHER_INPUT \
+       XXLOR VS0, VS0, V29; \
+       VXOR V30, V29, V15; \
+       VADDUWM V30, V31, V30
+
+// 4 encryptions in V15 - V18
+#define GEN_VCIPHER_4_INPUTS \
+       XXLOR VS0, VS0, V29; \
+       VXOR V30, V29, V15; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V16; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V17; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V18; \
+       VADDUWM V30, V31, V30
+
+// 8 encryptions in V15 - V22
+#define GEN_VCIPHER_8_INPUTS \
+       XXLOR VS0, VS0, V29; \
+       VXOR V30, V29, V15; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V16; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V17; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V18; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V19; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V20; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V21; \
+       VADDUWM V30, V31, V30; \
+       VXOR V30, V29, V22; \
+       VADDUWM V30, V31, V30
+
+// Load the round keys from blk_key into
+// VS0-VS10, VS0-VS12 or VS0-VS14 for a
+// key_len of 10, 12 or 14; R16-R19 are
+// used as offsets. Any other key_len
+// falls into the store to 0(R0), which
+// presumably faults on purpose. CR2 holds
+// key_len vs 12 (LT=10, EQ=12, GT=14).
+#define LOAD_KEYS(blk_key, key_len) \
+       MOVD    $16, R16; \
+       MOVD    $32, R17; \
+       MOVD    $48, R18; \
+       MOVD    $64, R19; \
+       LXVD2X (blk_key)(R0), VS0; \
+       LXVD2X (blk_key)(R16), VS1; \
+       LXVD2X (blk_key)(R17), VS2; \
+       LXVD2X (blk_key)(R18), VS3; \
+       LXVD2X (blk_key)(R19), VS4; \
+       ADD $64, R16; \
+       ADD $64, R17; \
+       ADD $64, R18; \
+       ADD $64, R19; \
+       LXVD2X (blk_key)(R16), VS5; \
+       LXVD2X (blk_key)(R17), VS6; \
+       LXVD2X (blk_key)(R18), VS7; \
+       LXVD2X (blk_key)(R19), VS8; \
+       ADD $64, R16; \
+       ADD $64, R17; \
+       ADD $64, R18; \
+       ADD $64, R19; \
+       LXVD2X (blk_key)(R16), VS9; \
+       LXVD2X (blk_key)(R17), VS10; \
+       CMP key_len, $12, CR2; \
+       CMP key_len, $10; \
+       BEQ keysLoaded; \
+       LXVD2X (blk_key)(R18), VS11; \
+       LXVD2X (blk_key)(R19), VS12; \
+       BEQ CR2, keysLoaded; \
+       ADD $64, R16; \
+       ADD $64, R17; \
+       LXVD2X (blk_key)(R16), VS13; \
+       LXVD2X (blk_key)(R17), VS14; \
+       CMP key_len, $14; \
+       BEQ keysLoaded; \
+       MOVD R0,0(R0); \
+keysLoaded:
+
+// Apply VCIPHER to one block (vin) with the 9 keys
+// VS1 - VS9, staged through V23 - V27 (clobbered).
+#define VCIPHER_1X9_KEYS(vin) \
+       XXLOR VS1, VS1, V23; \
+       XXLOR VS2, VS2, V24; \
+       XXLOR VS3, VS3, V25; \
+       XXLOR VS4, VS4, V26; \
+       XXLOR VS5, VS5, V27; \
+       VCIPHER vin, V23, vin; \
+       VCIPHER vin, V24, vin; \
+       VCIPHER vin, V25, vin; \
+       VCIPHER vin, V26, vin; \
+       VCIPHER vin, V27, vin; \
+       XXLOR VS6, VS6, V23; \
+       XXLOR VS7, VS7, V24; \
+       XXLOR VS8, VS8, V25; \
+       XXLOR VS9, VS9, V26; \
+       VCIPHER vin, V23, vin; \
+       VCIPHER vin, V24, vin; \
+       VCIPHER vin, V25, vin; \
+       VCIPHER vin, V26, vin
+
+// Apply VCIPHER to one block (vin) with key1 then
+// key2, which are first copied into V25 and V26.
+#define VCIPHER_1X2_KEYS(vin, key1, key2) \
+       XXLOR key1, key1, V25; \
+       XXLOR key2, key2, V26; \
+       VCIPHER vin, V25, vin; \
+       VCIPHER vin, V26, vin
+
+// Apply one VCIPHER to each of the 4 blocks
+// in V15 - V18 with the given VS register
+// key, which is first copied into V23.
+#define VCIPHER_4X1_KEY(key) \
+       XXLOR key, key, V23; \
+       VCIPHER V15, V23, V15; \
+       VCIPHER V16, V23, V16; \
+       VCIPHER V17, V23, V17; \
+       VCIPHER V18, V23, V18
+
+// Apply one VCIPHER to each of the 8 blocks
+// in V15 - V22 with the given key, assumed
+// to be a VS register; it is copied to V23.
+#define VCIPHER_8X1_KEY(key) \
+       XXLOR key, key, V23; \
+       VCIPHER V15, V23, V15; \
+       VCIPHER V16, V23, V16; \
+       VCIPHER V17, V23, V17; \
+       VCIPHER V18, V23, V18; \
+       VCIPHER V19, V23, V19; \
+       VCIPHER V20, V23, V20; \
+       VCIPHER V21, V23, V21; \
+       VCIPHER V22, V23, V22
+
+// Load 64 bytes at blk_inp into V1-V4 in big
+// endian order and advance blk_inp by 64.
+// R16-R18 are left holding 16, 32 and 48.
+#define LOAD_INPUT_BLOCK64(blk_inp) \
+       MOVD $16, R16; \
+       MOVD $32, R17; \
+       MOVD $48, R18; \
+       P8_LXVB16X(blk_inp,R0,V1); \
+       P8_LXVB16X(blk_inp,R16,V2); \
+       P8_LXVB16X(blk_inp,R17,V3); \
+       P8_LXVB16X(blk_inp,R18,V4); \
+       ADD $64, blk_inp
+
+// Load 128 bytes at blk_inp into V1-V8 in big
+// endian order and advance blk_inp by 128.
+// R16-R22 are left holding 16, 32, ..., 112.
+#define LOAD_INPUT_BLOCK128(blk_inp) \
+       MOVD $16, R16; \
+       MOVD $32, R17; \
+       MOVD $48, R18; \
+       MOVD $64, R19; \
+       MOVD $80, R20; \
+       MOVD $96, R21; \
+       MOVD $112, R22; \
+       P8_LXVB16X(blk_inp,R0,V1); \
+       P8_LXVB16X(blk_inp,R16,V2); \
+       P8_LXVB16X(blk_inp,R17,V3); \
+       P8_LXVB16X(blk_inp,R18,V4); \
+       P8_LXVB16X(blk_inp,R19,V5); \
+       P8_LXVB16X(blk_inp,R20,V6); \
+       P8_LXVB16X(blk_inp,R21,V7); \
+       P8_LXVB16X(blk_inp,R22,V8); \
+       ADD $128, blk_inp
+
+// VCIPHERLAST on V15 - V22 with the key held in V23, then
+// XOR each result into the input blocks in V1 - V8.
+#define VCIPHERLAST8_XOR_INPUT \
+       VCIPHERLAST     V15, V23, V15; \
+       VCIPHERLAST     V16, V23, V16; \
+       VCIPHERLAST     V17, V23, V17; \
+       VCIPHERLAST     V18, V23, V18; \
+       VCIPHERLAST     V19, V23, V19; \
+       VCIPHERLAST     V20, V23, V20; \
+       VCIPHERLAST     V21, V23, V21; \
+       VCIPHERLAST     V22, V23, V22; \
+       XXLXOR          V1, V15, V1; \
+       XXLXOR          V2, V16, V2; \
+       XXLXOR          V3, V17, V3; \
+       XXLXOR          V4, V18, V4; \
+       XXLXOR          V5, V19, V5; \
+       XXLXOR          V6, V20, V6; \
+       XXLXOR          V7, V21, V7; \
+       XXLXOR          V8, V22, V8
+
+// VCIPHERLAST on V15 - V18 with the key held in V23, then
+// XOR each result into the input blocks in V1 - V4.
+#define VCIPHERLAST4_XOR_INPUT \
+       VCIPHERLAST     V15, V23, V15; \
+       VCIPHERLAST     V16, V23, V16; \
+       VCIPHERLAST     V17, V23, V17; \
+       VCIPHERLAST     V18, V23, V18; \
+       XXLXOR          V1, V15, V1; \
+       XXLXOR          V2, V16, V2; \
+       XXLXOR          V3, V17, V3; \
+       XXLXOR          V4, V18, V4
+
+// Store V1-V8 as 128 bytes at blk_out in big
+// endian order and advance blk_out by 128.
+// Expects R16-R22 to hold 16, 32, ..., 112.
+#define STORE_OUTPUT_BLOCK128(blk_out) \
+       P8_STXVB16X(V1,blk_out,R0); \
+       P8_STXVB16X(V2,blk_out,R16); \
+       P8_STXVB16X(V3,blk_out,R17); \
+       P8_STXVB16X(V4,blk_out,R18); \
+       P8_STXVB16X(V5,blk_out,R19); \
+       P8_STXVB16X(V6,blk_out,R20); \
+       P8_STXVB16X(V7,blk_out,R21); \
+       P8_STXVB16X(V8,blk_out,R22); \
+       ADD $128, blk_out
+
+// Store V1-V4 as 64 bytes at blk_out in big
+// endian order and advance blk_out by 64.
+// Expects R16-R18 to hold 16, 32 and 48.
+#define STORE_OUTPUT_BLOCK64(blk_out) \
+       P8_STXVB16X(V1,blk_out,R0); \
+       P8_STXVB16X(V2,blk_out,R16); \
+       P8_STXVB16X(V3,blk_out,R17); \
+       P8_STXVB16X(V4,blk_out,R18); \
+       ADD $64, blk_out
+
  // func gcmInit(productTable *[256]byte, h []byte)
  TEXT ·gcmInit(SB), NOSPLIT, $0-32
         MOVD productTable+0(FP), XIP
@@ -588,3 +878,226 @@ TEXT ·gcmMul(SB), NOSPLIT, $0-32
  #endif
         STXVD2X VXL, (XIP+R0)      // write out Xi
         RET
+
+#define BLK_INP    R3
+#define BLK_OUT    R4
+#define BLK_KEY    R5
+#define KEY_LEN    R6
+#define BLK_IDX    R7
+#define IDX        R8
+#define IN_LEN     R9
+#define COUNTER    R10
+#define CONPTR     R14
+#define MASK       V5
+
+// counterCryptASM is the assembler implementation of counterCrypt: it
+// XORs in with the encrypted counter stream into out and writes the
+// advanced counter back. The loop is unrolled so that 8 or 4 encryption
+// streams run in parallel by interleaving their vcipher instructions
+// (also called stitching); leftovers are handled one block at a time and
+// then as a partial block. The P8_ macros cover big and little endian
+// as well as the different ISA targets. nr must be 10, 12 or 14.
+//
+// func counterCryptASM(nr int, out, in []byte, counter *[gcmBlockSize]byte, key *uint32)
+TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
+       MOVD    nr+0(FP), KEY_LEN
+       MOVD    out+8(FP), BLK_OUT
+       MOVD    out_len+16(FP), R8 // NOTE(review): out_len is not read again in this routine
+       MOVD    in+32(FP), BLK_INP
+       MOVD    in_len+40(FP), IN_LEN
+       MOVD    counter+56(FP), COUNTER
+       MOVD    key+64(FP), BLK_KEY
+
+// Set up permute string when needed.
+#ifdef NEEDS_ESPERM
+       MOVD    $·rcon(SB), R14
+       LVX     (R14), ESPERM   // Permute value for P8_ macros.
+#endif
+       SETUP_COUNTER           // V30 Counter V31 BE {0, 0, 0, 1}
+       LOAD_KEYS(BLK_KEY, KEY_LEN)     // VS0 - VS10/12/14 based on keysize
+       CMP     IN_LEN, $128
+       BLT     block64
+block128_loop:
+       // Do 8 encryptions in parallel by setting
+       // input values in V15-V22 and executing
+       // vcipher on the updated value and the keys.
+       GEN_VCIPHER_8_INPUTS
+       VCIPHER_8X1_KEY(VS1)
+       VCIPHER_8X1_KEY(VS2)
+       VCIPHER_8X1_KEY(VS3)
+       VCIPHER_8X1_KEY(VS4)
+       VCIPHER_8X1_KEY(VS5)
+       VCIPHER_8X1_KEY(VS6)
+       VCIPHER_8X1_KEY(VS7)
+       VCIPHER_8X1_KEY(VS8)
+       VCIPHER_8X1_KEY(VS9)
+       // Additional encryptions are done based on
+       // the key length, with the last key moved
+       // to V23 for use with VCIPHERLAST.
+       // CR2 = CMP key_len, $12
+       XXLOR VS10, VS10, V23
+       BLT     CR2, block128_last // key_len = 10
+       VCIPHER_8X1_KEY(VS10)
+       VCIPHER_8X1_KEY(VS11)
+       XXLOR VS12,VS12,V23
+       BEQ     CR2, block128_last // key_len = 12
+       VCIPHER_8X1_KEY(VS12)
+       VCIPHER_8X1_KEY(VS13)
+       XXLOR VS14,VS14,V23     // key_len = 14
+block128_last:
+       // vcipher encryptions are in V15-V22 at this
+       // point with vcipherlast remaining to be done.
+       // Load input block into V1-V8, setting index offsets
+       // in R16-R22 to use with the STORE.
+       LOAD_INPUT_BLOCK128(BLK_INP)
+       // Do VCIPHERLAST on the last key for each encryption
+       // stream and XOR the result with the corresponding
+       // value from the input block.
+       VCIPHERLAST8_XOR_INPUT
+       // Store the results (8*16) and update BLK_OUT by 128.
+       STORE_OUTPUT_BLOCK128(BLK_OUT)
+       ADD     $-128, IN_LEN   // input size
+       CMP     IN_LEN, $128    // check if >= blocksize
+       BGE     block128_loop   // next input block
+       CMP     IN_LEN, $0
+       BEQ     done
+block64:
+       CMP     IN_LEN, $64     // Check if >= 64
+       BLT     block16_loop
+       // Do 4 encryptions in parallel by setting
+       // input values in V15-V18 and executing
+       // vcipher on the updated value and the keys.
+       GEN_VCIPHER_4_INPUTS
+       VCIPHER_4X1_KEY(VS1)
+       VCIPHER_4X1_KEY(VS2)
+       VCIPHER_4X1_KEY(VS3)
+       VCIPHER_4X1_KEY(VS4)
+       VCIPHER_4X1_KEY(VS5)
+       VCIPHER_4X1_KEY(VS6)
+       VCIPHER_4X1_KEY(VS7)
+       VCIPHER_4X1_KEY(VS8)
+       VCIPHER_4X1_KEY(VS9)
+       // Check key length based on CR2
+       // Move last key to V23 for use with later vcipherlast
+       XXLOR   VS10, VS10, V23
+       BLT     CR2, block64_last       // size = 10
+       VCIPHER_4X1_KEY(VS10)           // Encrypt next 2 keys
+       VCIPHER_4X1_KEY(VS11)
+       XXLOR   VS12, VS12, V23
+       BEQ     CR2, block64_last       // size = 12
+       VCIPHER_4X1_KEY(VS12)           // Encrypt last 2 keys
+       VCIPHER_4X1_KEY(VS13)
+       XXLOR   VS14, VS14, V23         // size = 14
+block64_last:
+       LOAD_INPUT_BLOCK64(BLK_INP)     // Load 64 bytes of input
+       // Do VCIPHERLAST on the last for each encryption
+       // stream and XOR the result with the corresponding
+       // value from the input block.
+       VCIPHERLAST4_XOR_INPUT
+       // Store the results (4*16) and update BLK_OUT by 64.
+       STORE_OUTPUT_BLOCK64(BLK_OUT)
+       ADD     $-64, IN_LEN            // decrement input block length
+       CMP     IN_LEN, $0              // check for remaining length
+       BEQ     done
+block16_loop:
+       CMP     IN_LEN, $16             // More input
+       BLT     final_block             // If not, then handle partial block
+       // Single encryption, no stitching
+       GEN_VCIPHER_INPUT               // Generate input value for single encryption
+       VCIPHER_1X9_KEYS(V15)           // Encrypt V15 value with 9 keys
+       XXLOR   VS10, VS10, V23         // Last key -> V23 for later vcipherlast
+       // Key length based on CR2. (LT=10, EQ=12, GT=14)
+       BLT     CR2, block16_last       // Finish for key size 10
+       VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
+       XXLOR   VS12, VS12, V23         // Last key -> V23 for later vcipherlast
+       BEQ     CR2, block16_last       // Finish for key size 12
+       VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
+       XXLOR   VS14, VS14, V23         // Last key -> V23 for vcipherlast with key size 14
+block16_last:
+       P8_LXVB16X(BLK_INP, R0, V1)     // Load input
+       VCIPHERLAST V15, V23, V15       // Encrypt last value in V23
+       XXLXOR  V15, V1, V1             // XOR with input
+       P8_STXVB16X(V1, BLK_OUT, R0)    // Store final encryption value to output
+       ADD     $16, BLK_INP            // Increment input pointer
+       ADD     $16, BLK_OUT            // Increment output pointer
+       ADD     $-16, IN_LEN            // Decrement input length
+       BR      block16_loop            // Check for next
+final_block:
+       CMP     IN_LEN, $0
+       BEQ     done
+       GEN_VCIPHER_INPUT               // Generate input value for partial encryption
+       VCIPHER_1X9_KEYS(V15)           // Encrypt V15 with 9 keys
+       XXLOR   VS10, VS10, V23         // Save possible last key
+       BLT     CR2, final_block_last
+       VCIPHER_1X2_KEYS(V15, VS10, VS11)       // Encrypt V15 with next 2 keys
+       XXLOR   VS12, VS12, V23         // Save possible last key
+       BEQ     CR2, final_block_last
+       VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
+       XXLOR   VS14, VS14, V23         // Save last key
+final_block_last:
+       VCIPHERLAST V15, V23, V15       // Finish encryption
+#ifdef GOPPC64_power10
+       // set up length
+       SLD     $56, IN_LEN, R17
+       LXVLL   BLK_INP, R17, V25
+       VXOR    V25, V15, V25
+       STXVLL  V25, BLK_OUT, R17
+#else
+       ADD     $32, R1, MASK_PTR       // MASK_PTR = R1+32; NOTE(review): assumed to be the 16-byte local frame slot
+       MOVD    $0, R16
+       P8_STXVB16X(V15, MASK_PTR, R0)
+       CMP     IN_LEN, $8
+       BLT     next4
+       MOVD    0(MASK_PTR), R14
+       MOVD    0(BLK_INP), R15
+       XOR     R14, R15, R14
+       MOVD    R14, 0(BLK_OUT)
+       ADD     $8, R16
+       ADD     $-8, IN_LEN
+next4:
+       CMP     IN_LEN, $4
+       BLT     next2
+       MOVWZ   (BLK_INP)(R16), R15
+       MOVWZ   (MASK_PTR)(R16), R14
+       XOR     R14, R15, R14
+       MOVW    R14, (R16)(BLK_OUT)
+       ADD     $4, R16
+       ADD     $-4, IN_LEN
+next2:
+       CMP     IN_LEN, $2
+       BLT     next1
+       MOVHZ   (BLK_INP)(R16), R15
+       MOVHZ   (MASK_PTR)(R16), R14
+       XOR     R14, R15, R14
+       MOVH    R14, (R16)(BLK_OUT)
+       ADD     $2, R16
+       ADD     $-2, IN_LEN
+next1:
+       CMP     IN_LEN, $1
+       BLT     done
+       MOVBZ   (MASK_PTR)(R16), R14
+       MOVBZ   (BLK_INP)(R16), R15
+       XOR     R14, R15, R14
+       MOVB    R14, (R16)(BLK_OUT)
+#endif
+done:
+       // Save the updated counter value
+       P8_STXVB16X(V30, COUNTER, R0)
+       // Clear the keys in VS0-VS14. NOTE(review): key copies in V23-V27 and V29 are not cleared.
+       XXLXOR  VS0, VS0, VS0
+       XXLXOR  VS1, VS1, VS1
+       XXLXOR  VS2, VS2, VS2
+       XXLXOR  VS3, VS3, VS3
+       XXLXOR  VS4, VS4, VS4
+       XXLXOR  VS5, VS5, VS5
+       XXLXOR  VS6, VS6, VS6
+       XXLXOR  VS7, VS7, VS7
+       XXLXOR  VS8, VS8, VS8
+       XXLXOR  VS9, VS9, VS9
+       XXLXOR  VS10, VS10, VS10
+       XXLXOR  VS11, VS11, VS11
+       XXLXOR  VS12, VS12, VS12
+       XXLXOR  VS13, VS13, VS13
+       XXLXOR  VS14, VS14, VS14
+       RET
+
author	Lynn Boger <laboger@linux.vnet.ibm.com>
	Tue, 11 Apr 2023 19:49:29 +0000 (14:49 -0500)
committer	Lynn Boger <laboger@linux.vnet.ibm.com>
	Mon, 9 Oct 2023 18:53:44 +0000 (18:53 +0000)
src/crypto/aes/gcm_ppc64x.go		patch \| blob \| history
src/crypto/aes/gcm_ppc64x.s		patch \| blob \| history