MOVD dec+24(FP), OUTDEC
#ifdef GOARCH_ppc64le
- MOVD $·rcon(SB), PTR // PTR point to rcon addr
+ MOVD $·rcon(SB), PTR // PTR points to rcon addr
LVX (PTR), ESPERM
ADD $0x10, PTR
#else
- MOVD $·rcon+0x10(SB), PTR // PTR point to rcon addr (skipping permute vector)
+ MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif
// Get key from memory and write aligned into VR
#undef KEY
#undef TMP
-// CBC encrypt or decrypt
-// R3 src
-// R4 dst
-// R5 len
-// R6 key
-// R7 iv
-// R8 enc=1 dec=0
-// Ported from: aes_p8_cbc_encrypt
-// Register usage:
-// R9: ROUNDS
-// R10: Index
-// V4: IV
-// V5: SRC
-// V7: DST
-
#define INP R3
-#define OUT R4
+#define OUTP R4
#define LEN R5
-#define KEY R6
-#define IVP R7
-#define ENC R8
-#define ROUNDS R9
-#define IDX R10
+#define KEYP R6
+#define ROUNDS R7
+#define IVP R8
+#define ENC R9
-#define RNDKEY0 V0
#define INOUT V2
#define TMP V3
-
#define IVEC V4
-// Vector loads are done using LVX followed by
-// a VPERM using mask generated from previous
-// LVSL or LVSR instruction, to obtain the correct
-// bytes if address is unaligned.
-
-// Encryption is done with VCIPHER and VCIPHERLAST
-// Decryption is done with VNCIPHER and VNCIPHERLAST
-
-// Encrypt and decypt is done as follows:
-// - INOUT value is initialized in outer loop.
-// - ROUNDS value is adjusted for loop unrolling.
-// - Encryption/decryption is done in loop based on
-// adjusted ROUNDS value.
-// - Final INOUT value is encrypted/decrypted and stored.
-
-// Note: original implementation had an 8X version
-// for decryption which was omitted to avoid the
-// complexity.
-
-// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
+// Load the crypt key into VSRs.
+//
+// The expanded key is stored and loaded using
+// STXVD2X/LXVD2X. The in-memory byte ordering
+// depends on the endianness of the machine. The
+// expanded keys are generated by expandKeyAsm above.
+//
+// Rkeyp holds the key pointer. It is clobbered. Once
+// the expanded keys are loaded, it is not needed.
+//
+// R12,R14-R21 are scratch registers.
+// For 10 rounds (AES-128), V6, V11-V20 hold the expanded key.
+// For 12 rounds (AES-192), V6, V9-V20 hold the expanded key.
+// For 14 rounds (AES-256), V6, V7-V20 hold the expanded key.
+#define LOAD_KEY(Rkeyp) \
+ MOVD $16, R12 \
+ MOVD $32, R14 \
+ MOVD $48, R15 \
+ MOVD $64, R16 \
+ MOVD $80, R17 \
+ MOVD $96, R18 \
+ MOVD $112, R19 \
+ MOVD $128, R20 \
+ MOVD $144, R21 \
+ LXVD2X (R0+Rkeyp), V6 \
+ ADD $16, Rkeyp \
+ BEQ CR1, L_start10 \
+ BEQ CR2, L_start12 \
+ LXVD2X (R0+Rkeyp), V7 \
+ LXVD2X (R12+Rkeyp), V8 \
+ ADD $32, Rkeyp \
+ L_start12: \
+ LXVD2X (R0+Rkeyp), V9 \
+ LXVD2X (R12+Rkeyp), V10 \
+ ADD $32, Rkeyp \
+ L_start10: \
+ LXVD2X (R0+Rkeyp), V11 \
+ LXVD2X (R12+Rkeyp), V12 \
+ LXVD2X (R14+Rkeyp), V13 \
+ LXVD2X (R15+Rkeyp), V14 \
+ LXVD2X (R16+Rkeyp), V15 \
+ LXVD2X (R17+Rkeyp), V16 \
+ LXVD2X (R18+Rkeyp), V17 \
+ LXVD2X (R19+Rkeyp), V18 \
+ LXVD2X (R20+Rkeyp), V19 \
+ LXVD2X (R21+Rkeyp), V20
+
+// Perform the AES cipher operation for 10/12/14 rounds using the keys
+// loaded by LOAD_KEY, and the round-count information held in CR1EQ/CR2EQ.
+//
+// Vxor is ideally V6 (Key[0-3]), but for slightly better encryption
+// performance V6 and IVEC can be swapped (xor is both associative and
+// commutative) during encryption:
+//
+// VXOR INOUT, IVEC, INOUT
+// VXOR INOUT, V6, INOUT
+//
+// into
+//
+// VXOR INOUT, V6, INOUT
+// VXOR INOUT, IVEC, INOUT
+//
+#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
+ VXOR Vin, Vxor, Vout \
+ BEQ CR1, label10 \
+ BEQ CR2, label12 \
+ vcipher Vout, V7, Vout \
+ vcipher Vout, V8, Vout \
+ label12: \
+ vcipher Vout, V9, Vout \
+ vcipher Vout, V10, Vout \
+ label10: \
+ vcipher Vout, V11, Vout \
+ vcipher Vout, V12, Vout \
+ vcipher Vout, V13, Vout \
+ vcipher Vout, V14, Vout \
+ vcipher Vout, V15, Vout \
+ vcipher Vout, V16, Vout \
+ vcipher Vout, V17, Vout \
+ vcipher Vout, V18, Vout \
+ vcipher Vout, V19, Vout \
+ vciphel Vout, V20, Vout \
+
+#define CLEAR_KEYS() \
+ VXOR V6, V6, V6 \
+ VXOR V7, V7, V7 \
+ VXOR V8, V8, V8 \
+ VXOR V9, V9, V9 \
+ VXOR V10, V10, V10 \
+ VXOR V11, V11, V11 \
+ VXOR V12, V12, V12 \
+ VXOR V13, V13, V13 \
+ VXOR V14, V14, V14 \
+ VXOR V15, V15, V15 \
+ VXOR V16, V16, V16 \
+ VXOR V17, V17, V17 \
+ VXOR V18, V18, V18 \
+ VXOR V19, V19, V19 \
+ VXOR V20, V20, V20
+
+// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
MOVD src+0(FP), INP
- MOVD dst+8(FP), OUT
+ MOVD dst+8(FP), OUTP
MOVD length+16(FP), LEN
- MOVD key+24(FP), KEY
+ MOVD key+24(FP), KEYP
MOVD iv+32(FP), IVP
MOVD enc+40(FP), ENC
MOVD nr+48(FP), ROUNDS
LVX (R11), ESPERM // Permute value for P8_ macros.
#endif
- CMPU LEN, $16 // cmpldi r5,16
- BC 14, 0, LR // bltlr-, return if len < 16.
- CMPW ENC, $0 // cmpwi r8,0
+ // Assume len > 0 && len % blockSize == 0.
+ CMPW ENC, $0
+ P8_LXVB16X(IVP, R0, IVEC)
+ CMPU ROUNDS, $10, CR1
+ CMPU ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.
- P8_LXVB16X(IVP, R0, IVEC) // load ivec in BE register order
+ // Setup key in VSRs, and set loop count in CTR.
+ LOAD_KEY(KEYP)
+ SRD $4, LEN
+ MOVD LEN, CTR
- SRW $1, ROUNDS // rlwinm r9,r9,31,1,31
- MOVD $0, IDX // li r10,0
- ADD $-1, ROUNDS // addi r9,r9,-1
- BEQ Lcbc_dec // beq
- PCALIGN $16
+ BEQ Lcbc_dec
- // Outer loop: initialize encrypted value (INOUT)
- // Load input (INPTAIL) ivec (IVEC)
+ PCALIGN $32
Lcbc_enc:
- P8_LXVB16X(INP, R0, INOUT) // load text in BE vreg order
- ADD $16, INP // addi r3,r3,16
- MOVD ROUNDS, CTR // mtctr r9
- ADD $-16, LEN // addi r5,r5,-16
- LXVD2X (KEY+IDX), RNDKEY0 // load first xkey
- ADD $16, IDX // addi r10,r10,16
- VXOR INOUT, RNDKEY0, INOUT // vxor v2,v2,v0
- VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4
-
- // Encryption loop of INOUT using RNDKEY0
-Loop_cbc_enc:
- LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
- VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v1
- ADD $16, IDX // addi r10,r10,16
- LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
- VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v1
- ADD $16, IDX // addi r10,r10,16
- BDNZ Loop_cbc_enc
-
- // Encrypt tail values and store INOUT
- LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
- VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v1
- ADD $16, IDX // addi r10,r10,16
- LXVD2X (KEY+IDX), RNDKEY0 // load final xkey
- VCIPHERLAST INOUT, RNDKEY0, IVEC // vcipherlast v4,v2,v0
- MOVD $0, IDX // reset key index for next block
- CMPU LEN, $16 // cmpldi r5,16
- P8_STXVB16X(IVEC, OUT, R0) // store ciphertext in BE order
- ADD $16, OUT // addi r4,r4,16
- BGE Lcbc_enc // bge Lcbc_enc
- BR Lcbc_done // b Lcbc_done
-
- // Outer loop: initialize decrypted value (INOUT)
- // Load input (INPTAIL) ivec (IVEC)
-Lcbc_dec:
- P8_LXVB16X(INP, R0, TMP) // load ciphertext in BE vreg order
- ADD $16, INP // addi r3,r3,16
- MOVD ROUNDS, CTR // mtctr r9
- ADD $-16, LEN // addi r5,r5,-16
- LXVD2X (KEY+IDX), RNDKEY0 // load first xkey
- ADD $16, IDX // addi r10,r10,16
- VXOR TMP, RNDKEY0, INOUT // vxor v2,v3,v0
- PCALIGN $16
-
- // Decryption loop of INOUT using RNDKEY0
-Loop_cbc_dec:
- LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
- ADD $16, IDX // addi r10,r10,16
- VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v1
- LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
- ADD $16, IDX // addi r10,r10,16
- VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v0
- BDNZ Loop_cbc_dec
-
- // Decrypt tail values and store INOUT
- LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
- ADD $16, IDX // addi r10,r10,16
- VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v1
- LXVD2X (KEY+IDX), RNDKEY0 // load final xkey
- MOVD $0, IDX // li r10,0
- VNCIPHERLAST INOUT, RNDKEY0, INOUT // vncipherlast v2,v2,v0
- CMPU LEN, $16 // cmpldi r5,16
- VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4
- VOR TMP, TMP, IVEC // vor v4,v3,v3
- P8_STXVB16X(INOUT, OUT, R0) // store text in BE order
- ADD $16, OUT // addi r4,r4,16
- BGE Lcbc_dec // bge
-
-Lcbc_done:
- VXOR RNDKEY0, RNDKEY0, RNDKEY0 // clear key register
- P8_STXVB16X(IVEC, R0, IVP) // Save ivec in BE order for next round.
- RET // bclr 20,lt,0
+ P8_LXVB16X(INP, R0, INOUT)
+ ADD $16, INP
+ VXOR INOUT, V6, INOUT
+ CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
+ VOR INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
+ P8_STXVB16X(INOUT, OUTP, R0)
+ ADD $16, OUTP
+ BDNZ Lcbc_enc
+
+ P8_STXVB16X(INOUT, IVP, R0)
+ CLEAR_KEYS()
+ RET
+ PCALIGN $32
+Lcbc_dec:
+ P8_LXVB16X(INP, R0, TMP)
+ ADD $16, INP
+ CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
+ VXOR INOUT, IVEC, INOUT
+ VOR TMP, TMP, IVEC // TMP is IVEC for next block.
+ P8_STXVB16X(INOUT, OUTP, R0)
+ ADD $16, OUTP
+ BDNZ Lcbc_dec
+
+ P8_STXVB16X(IVEC, IVP, R0)
+ CLEAR_KEYS()
+ RET