From 8d581f589eb9b9eac05c75f0dfe82a49c3afcd2f Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy"
Date: Thu, 24 Mar 2022 10:09:13 -0500
Subject: [PATCH] crypto/aes: simplify key expansion in ppc64le asm

The ported cryptogams implementation uses a subtle and tricky
lvx/vperm/lvsl mechanism to load unaligned vectors. This is difficult
to read, and it may read and write unrelated bytes when given an
unaligned address.

Instead, POWER8 instructions can be used to load from unaligned memory
with much less overhead. Alignment interrupts only occur when reading
or writing cache-inhibited memory, which we assume Go does not use
today; otherwise, alignment penalties are usually marginal.

lxvd2x+xxpermdi and xxpermdi+stxvd2x can be used to emulate unaligned
LE bytewise loads and stores, similar to lxv/stxv on POWER9 in
little-endian mode. Likewise, a custom permute vector is used to
emulate the BE bytewise load/store operations lxvb16x/stxvb16x of
POWER9.

This greatly simplifies the code, and it makes it much easier to store
the keys in reverse (which is exactly how the decrypt keys are
expected to be stored).

Change-Id: I2334337e31a8fdf8d13ba96231142a039f237098
Reviewed-on: https://go-review.googlesource.com/c/go/+/395494
Reviewed-by: Lynn Boger
Trust: Paul Murphy
Run-TryBot: Paul Murphy
TryBot-Result: Gopher Robot
---
 src/crypto/aes/asm_ppc64le.s | 132 ++++++++++++++---------------------
 1 file changed, 54 insertions(+), 78 deletions(-)

diff --git a/src/crypto/aes/asm_ppc64le.s b/src/crypto/aes/asm_ppc64le.s
index 5eae675322..192a8096cd 100644
--- a/src/crypto/aes/asm_ppc64le.s
+++ b/src/crypto/aes/asm_ppc64le.s
@@ -43,6 +43,10 @@
 #define OUTHEAD V10
 #define OUTTAIL V11
 
+// For P9 instruction emulation
+#define ESPERM V21 // Endian swapping permute into BE
+#define TMP2 V22   // Temporary for P8_STXVB16X/P8_STXV
+
 // For {en,de}cryptBlockAsm
 #define BLK_INP R3
 #define BLK_OUT R4
@@ -50,15 +54,38 @@
 #define BLK_ROUNDS R6
 #define BLK_IDX R7
 
-DATA ·rcon+0x00(SB)/8, $0x0100000001000000 // RCON
-DATA ·rcon+0x08(SB)/8, $0x0100000001000000 // RCON
-DATA ·rcon+0x10(SB)/8, $0x1b0000001b000000
-DATA ·rcon+0x18(SB)/8, $0x1b0000001b000000
-DATA ·rcon+0x20(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
-DATA ·rcon+0x28(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
-DATA ·rcon+0x30(SB)/8, $0x0000000000000000
-DATA ·rcon+0x38(SB)/8, $0x0000000000000000
-GLOBL ·rcon(SB), RODATA, $64
+DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
+DATA ·rcon+0x08(SB)/8, $0x0706050403020100
+DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
+DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
+DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
+DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
+DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
+DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
+DATA ·rcon+0x40(SB)/8, $0x0000000000000000
+DATA ·rcon+0x48(SB)/8, $0x0000000000000000
+GLOBL ·rcon(SB), RODATA, $80
+
+// Emulate unaligned BE vector load/stores on LE targets
+#define P8_LXVB16X(RA,RB,VT) \
+	LXVD2X	(RA+RB), VT \
+	VPERM	VT, VT, ESPERM, VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+	VPERM	VS, VS, ESPERM, TMP2 \
+	STXVD2X	TMP2, (RA+RB)
+
+#define P8_STXV(VS,RA,RB) \
+	XXPERMDI	VS, VS, $2, TMP2 \
+	STXVD2X	TMP2, (RA+RB)
+
+#define P8_LXV(RA,RB,VT) \
+	LXVD2X	(RA+RB), VT \
+	XXPERMDI	VT, VT, $2, VT
+
+#define LXSDX_BE(RA,RB,VT) \
+	LXSDX	(RA+RB), VT \
+	VPERM	VT, VT, ESPERM, VT
 
 // func setEncryptKeyAsm(key *byte, keylen int, enc *uint32) int
 TEXT ·setEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
@@ -87,45 +114,32 @@ TEXT ·doEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
 	BC	0x06, 2, enc_key_abort // bne- .Lenc_key_abort
 
 	MOVD	$·rcon(SB), PTR // PTR point to rcon addr
+	LVX	(PTR), ESPERM
+	ADD	$0x10, PTR
 
 	// Get key from memory and write aligned into VR
-	NEG	INP, R9 // neg 9,3 R9 is ~INP + 1
-	LVX	(INP)(R0), IN0 // lvx 1,0,3 Load key inside IN0
-	ADD	$15, INP, INP // addi 3,3,15 Add 15B to INP addr
-	LVSR	(R9)(R0), KEY // lvsr 3,0,9
+	P8_LXVB16X(INP, R0, IN0)
+	ADD	$0x10, INP, INP
 	MOVD	$0x20, R8 // li 8,0x20 R8 = 32
+
 	CMPW	BITS, $192 // cmpwi 4,192 Key size == 192?
-	LVX	(INP)(R0), IN1 // lvx 2,0,3
-	VSPLTISB	$0x0f, MASK // vspltisb 5,0x0f 0x0f0f0f0f... mask
 	LVX	(PTR)(R0), RCON // lvx 4,0,6 Load first 16 bytes into RCON
-	VXOR	KEY, MASK, KEY // vxor 3,3,5 Adjust for byte swap
 	LVX	(PTR)(R8), MASK // lvx 5,8,6
 	ADD	$0x10, PTR, PTR // addi 6,6,0x10 PTR to next 16 bytes of RCON
-	VPERM	IN0, IN1, KEY, IN0 // vperm 1,1,2,3 Align
 	MOVD	$8, CNT // li 7,8 CNT = 8
 	VXOR	ZERO, ZERO, ZERO // vxor 0,0,0 Zero to be zero :)
 	MOVD	CNT, CTR // mtctr 7 Set the counter to 8 (rounds)
-	LVSL	(OUT)(R0), OUTPERM // lvsl 8,0,5
-	VSPLTISB	$-1, OUTMASK // vspltisb 9,-1
-	LVX	(OUT)(R0), OUTHEAD // lvx 10,0,5
-	VPERM	OUTMASK, ZERO, OUTPERM, OUTMASK // vperm 9,9,0,8
-
 	BLT	loop128 // blt .Loop128
 
-	ADD	$8, INP, INP // addi 3,3,8
 	BEQ	l192 // beq .L192
 
-	ADD	$8, INP, INP // addi 3,3,8
 	JMP	l256 // b .L256
 
loop128:
 	// Key schedule (Round 1 to 8)
 	VPERM	IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
 	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
-	VPERM	IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+	P8_STXV(IN0, R0, OUT)
 	VCIPHERLAST	KEY, RCON, KEY // vcipherlast 3,3,4
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5 Write to output
 	ADD	$16, OUT, OUT // addi 5,5,16 Point to the next round
 
 	VXOR	IN0, TMP, IN0 // vxor 1,1,6
@@ -142,11 +156,8 @@ loop128:
 	// Key schedule (Round 9)
 	VPERM	IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-spat
 	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
-	VPERM	IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+	P8_STXV(IN0, R0, OUT)
 	VCIPHERLAST	KEY, RCON, KEY // vcipherlast 3,3,4
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5 Round 9
 	ADD	$16, OUT, OUT // addi 5,5,16
 
 	// Key schedule (Round 10)
@@ -160,11 +171,8 @@ loop128:
 	VPERM	IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
 	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
-	VPERM	IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+	P8_STXV(IN0, R0, OUT)
 	VCIPHERLAST	KEY, RCON, KEY // vcipherlast 3,3,4
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5 Round 10
 	ADD	$16, OUT, OUT // addi 5,5,16
 
 	// Key schedule (Round 11)
@@ -174,26 +182,18 @@ loop128:
 	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 	VXOR	IN0, TMP, IN0 // vxor 1,1,6
 	VXOR	IN0, KEY, IN0 // vxor 1,1,3
-	VPERM	IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5 Round 11
+	P8_STXV(IN0, R0, OUT)
 
-	ADD	$15, OUT, INP // addi 3,5,15
 	ADD	$0x50, OUT, OUT // addi 5,5,0x50
 
 	MOVD	$10, ROUNDS // li 8,10
 	JMP	done // b .Ldone
 
l192:
-	LVX	(INP)(R0), TMP // lvx 6,0,3
+	LXSDX_BE(INP, R0, IN1) // Load next 8 bytes into upper half of VSR in BE order.
 	MOVD	$4, CNT // li 7,4
-	VPERM	IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5
+	P8_STXV(IN0, R0, OUT)
 	ADD	$16, OUT, OUT // addi 5,5,16
-	VPERM	IN1, TMP, KEY, IN1 // vperm 2,2,6,3
 	VSPLTISB	$8, KEY // vspltisb 3,8
 	MOVD	CNT, CTR // mtctr 7
 	VSUBUBM	MASK, KEY, MASK // vsububm 5,5,3
@@ -221,23 +221,17 @@ loop192:
 	VPERM	IN1, IN1, MASK, KEY // vperm 3,2,2,5
 	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
-	VPERM	STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+	P8_STXV(STAGE, R0, OUT)
 	VCIPHERLAST	KEY, RCON, KEY // vcipherlast 3,3,4
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5
 	ADD	$16, OUT, OUT // addi 5,5,16
 
 	VSLDOI	$8, IN0, IN1, STAGE // vsldoi 7,1,2,8
 	VXOR	IN0, TMP, IN0 // vxor 1,1,6
 	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
-	VPERM	STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+	P8_STXV(STAGE, R0, OUT)
 	VXOR	IN0, TMP, IN0 // vxor 1,1,6
 	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 	VXOR	IN0, TMP, IN0 // vxor 1,1,6
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5
 	ADD	$16, OUT, OUT // addi 5,5,16
 
 	VSPLTW	$3, IN0, TMP // vspltw 6,1,3
@@ -247,11 +241,7 @@ loop192:
 	VXOR	IN1, TMP, IN1 // vxor 2,2,6
 	VXOR	IN0, KEY, IN0 // vxor 1,1,3
 	VXOR	IN1, KEY, IN1 // vxor 2,2,3
-	VPERM	IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5
-	ADD	$15, OUT, INP // addi 3,5,15
+	P8_STXV(IN0, R0, OUT)
 	ADD	$16, OUT, OUT // addi 5,5,16
 
 	BC	0x10, 0, loop192 // bdnz .Loop192
@@ -260,25 +250,18 @@ loop192:
 	BR	done // b .Ldone
 
l256:
-	LVX	(INP)(R0), TMP // lvx 6,0,3
+	P8_LXVB16X(INP, R0, IN1)
 	MOVD	$7, CNT // li 7,7
 	MOVD	$14, ROUNDS // li 8,14
-	VPERM	IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5
+	P8_STXV(IN0, R0, OUT)
 	ADD	$16, OUT, OUT // addi 5,5,16
-	VPERM	IN1, TMP, KEY, IN1 // vperm 2,2,6,3
 	MOVD	CNT, CTR // mtctr 7
 
loop256:
 	VPERM	IN1, IN1, MASK, KEY // vperm 3,2,2,5
 	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
-	VPERM	IN1, IN1, OUTPERM, OUTTAIL // vperm 11,2,2,8
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+	P8_STXV(IN1, R0, OUT)
 	VCIPHERLAST	KEY, RCON, KEY // vcipherlast 3,3,4
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5
 	ADD	$16, OUT, OUT // addi 5,5,16
 
 	VXOR	IN0, TMP, IN0 // vxor 1,1,6
@@ -288,11 +271,7 @@ loop256:
 	VXOR	IN0, TMP, IN0 // vxor 1,1,6
 	VADDUWM	RCON, RCON, RCON // vadduwm 4,4,4
 	VXOR	IN0, KEY, IN0 // vxor 1,1,3
-	VPERM	IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR	OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
-	STVX	STAGE, (OUT+R0) // stvx 7,0,5
-	ADD	$15, OUT, INP // addi 3,5,15
+	P8_STXV(IN0, R0, OUT)
 	ADD	$16, OUT, OUT // addi 5,5,16
 
 	BC	0x12, 0, done // bdz .Ldone
@@ -310,9 +289,6 @@ loop256:
 	JMP	loop256 // b .Loop256
 
done:
-	LVX	(INP)(R0), IN1 // lvx 2,0,3
-	VSEL	OUTHEAD, IN1, OUTMASK, IN1 // vsel 2,10,2,9
-	STVX	IN1, (INP+R0) // stvx 2,0,3
 	MOVD	$0, PTR // li 6,0 set PTR to 0 (exit code 0)
 	MOVW	ROUNDS, 0(OUT) // stw 8,0(5)
-- 
2.48.1
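Aside for reviewers: the emulation macros in this patch rely on a non-obvious property of lxvd2x/stxvd2x. Both tolerate unaligned addresses, but on a little-endian target they byte-reverse each doubleword relative to big-endian lane numbering, so one follow-up vperm with ESPERM (or an xxpermdi doubleword swap for the LE case) recovers the byte order that POWER9's lxvb16x/stxvb16x (or lxv/stxv) would produce directly. The plain-Go sketch below models that data movement with vectors written as [16]byte in big-endian lane order. It is an illustration only, not code from this patch: the helper names (p8LXVB16X, p8STXV, esperm, and friends) are invented for the sketch, and the lane-level semantics are our reading of the ISA.

```go
package main

import (
	"bytes"
	"fmt"
)

// esperm is a model of the permute mask the patch stores at ·rcon+0x00,
// as its lanes appear to vperm after the aligned LVX load on a
// little-endian machine: it reverses the bytes within each doubleword.
var esperm = [16]byte{7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}

// lxvd2xLE models lxvd2x in little-endian mode: two 8-byte loads with
// no alignment requirement, each doubleword byte-reversed relative to
// big-endian lane numbering.
func lxvd2xLE(p []byte) (v [16]byte) {
	for i := 0; i < 8; i++ {
		v[i] = p[7-i]    // doubleword 0 <- p[0:8], byte-reversed
		v[8+i] = p[15-i] // doubleword 1 <- p[8:16], byte-reversed
	}
	return
}

// stxvd2xLE models stxvd2x in little-endian mode, the mirror image of
// lxvd2xLE.
func stxvd2xLE(v [16]byte, p []byte) {
	for i := 0; i < 8; i++ {
		p[7-i] = v[i]
		p[15-i] = v[8+i]
	}
}

// vperm models VPERM VT, VT, ESPERM, VT for the single-source case:
// lane i of the result is lane perm[i]&0xf of the source.
func vperm(src, perm [16]byte) (v [16]byte) {
	for i := range v {
		v[i] = src[perm[i]&0xf]
	}
	return
}

// xxpermdi2 models XXPERMDI VS, VS, $2, VT: it swaps the two
// doublewords of the vector.
func xxpermdi2(v [16]byte) (r [16]byte) {
	copy(r[:8], v[8:])
	copy(r[8:], v[:8])
	return
}

// p8LXVB16X models the P8_LXVB16X macro: the permute undoes the
// per-doubleword reversal done by lxvd2x, leaving the 16 memory bytes
// in big-endian lane order (like POWER9 lxvb16x) at any alignment.
func p8LXVB16X(p []byte) [16]byte { return vperm(lxvd2xLE(p), esperm) }

// p8STXV models the P8_STXV macro: swapping doublewords before the
// per-doubleword-reversing store writes the vector in little-endian
// byte order (like POWER9 stxv on an LE target) at any alignment.
func p8STXV(v [16]byte, p []byte) { stxvd2xLE(xxpermdi2(v), p) }

func reverse(p []byte) []byte {
	r := make([]byte, len(p))
	for i, b := range p {
		r[len(p)-1-i] = b
	}
	return r
}

func main() {
	buf := make([]byte, 17)
	for i := range buf {
		buf[i] = byte(i)
	}
	key := buf[1:] // a deliberately unaligned 16-byte "key"

	v := p8LXVB16X(key)
	fmt.Printf("BE load of unaligned key: % x\n", v) // 01 02 ... 10

	out := make([]byte, 16)
	p8STXV(v, out)
	// An stxv-style LE store writes the byte reverse of the BE lanes.
	fmt.Println("LE store is BE load reversed:", bytes.Equal(out, reverse(key)))
}
```

Running the sketch shows why no custom permute is needed for P8_LXV/P8_STXV: the per-doubleword reversal is already the native LE layout, so a doubleword swap alone matches lxv/stxv, while the full esperm byte reversal is reserved for the BE-ordered lxvb16x/stxvb16x emulations.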