#define OUTHEAD V10
#define OUTTAIL V11
+// For P9 instruction emulation
+#define ESPERM V21 // Endian swapping permute into BE; control operand of the VPERM in P8_LXVB16X, P8_STXVB16X and LXSDX_BE
+#define TMP2 V22 // Temporary for P8_STXVB16X/P8_STXV; overwritten by every expansion of either store macro
+
// For {en,de}cryptBlockAsm
#define BLK_INP R3
#define BLK_OUT R4
#define BLK_ROUNDS R6
#define BLK_IDX R7
-DATA ·rcon+0x00(SB)/8, $0x0100000001000000 // RCON
-DATA ·rcon+0x08(SB)/8, $0x0100000001000000 // RCON
-DATA ·rcon+0x10(SB)/8, $0x1b0000001b000000
-DATA ·rcon+0x18(SB)/8, $0x1b0000001b000000
-DATA ·rcon+0x20(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
-DATA ·rcon+0x28(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
-DATA ·rcon+0x30(SB)/8, $0x0000000000000000
-DATA ·rcon+0x38(SB)/8, $0x0000000000000000
-GLOBL ·rcon(SB), RODATA, $64
+DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap; setEncryptKeyAsm loads these 16 bytes into ESPERM
+DATA ·rcon+0x08(SB)/8, $0x0706050403020100 // second half of the 16-byte permute control
+DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON; loaded into RCON once PTR has been advanced by 0x10
+DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
+DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000 // entry 16 bytes past RCON; NOTE(review): its load is not visible in this hunk - confirm where it is consumed
+DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
+DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK; loaded with LVX (PTR)(R8) where PTR = rcon+0x10 and R8 = 0x20
+DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
+DATA ·rcon+0x40(SB)/8, $0x0000000000000000 // all-zero entry; NOTE(review): no use visible in this hunk
+DATA ·rcon+0x48(SB)/8, $0x0000000000000000
+GLOBL ·rcon(SB), RODATA, $80 // 5 entries x 16 bytes; every offset above moved up by 0x10 to make room for the permute at +0x00
+
+// Emulate unaligned BE vector load/stores on LE targets. Loads take (RA,RB,VT), stores take (VS,RA,RB), and the address is (RA+RB). The VPERM-based forms (P8_LXVB16X, P8_STXVB16X, LXSDX_BE) require ESPERM to be loaded beforehand; P8_LXV/P8_STXV swap doublewords with XXPERMDI $2 instead. Both store forms clobber TMP2 and leave VS unchanged.
+#define P8_LXVB16X(RA,RB,VT) \
+ LXVD2X (RA+RB), VT \
+ VPERM VT, VT, ESPERM, VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+ VPERM VS, VS, ESPERM, TMP2 \
+ STXVD2X TMP2, (RA+RB)
+
+#define P8_STXV(VS,RA,RB) \
+ XXPERMDI VS, VS, $2, TMP2 \
+ STXVD2X TMP2, (RA+RB)
+
+#define P8_LXV(RA,RB,VT) \
+ LXVD2X (RA+RB), VT \
+ XXPERMDI VT, VT, $2, VT
+
+#define LXSDX_BE(RA,RB,VT) \
+ LXSDX (RA+RB), VT \
+ VPERM VT, VT, ESPERM, VT
// func setEncryptKeyAsm(key *byte, keylen int, enc *uint32) int
TEXT ·setEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
BC 0x06, 2, enc_key_abort // bne- .Lenc_key_abort
MOVD $·rcon(SB), PTR // PTR point to rcon addr
+ LVX (PTR), ESPERM
+ ADD $0x10, PTR
// Get key from memory and write aligned into VR
- NEG INP, R9 // neg 9,3 R9 is ~INP + 1
- LVX (INP)(R0), IN0 // lvx 1,0,3 Load key inside IN0
- ADD $15, INP, INP // addi 3,3,15 Add 15B to INP addr
- LVSR (R9)(R0), KEY // lvsr 3,0,9
+ P8_LXVB16X(INP, R0, IN0)
+ ADD $0x10, INP, INP
MOVD $0x20, R8 // li 8,0x20 R8 = 32
+
CMPW BITS, $192 // cmpwi 4,192 Key size == 192?
- LVX (INP)(R0), IN1 // lvx 2,0,3
- VSPLTISB $0x0f, MASK// vspltisb 5,0x0f 0x0f0f0f0f... mask
LVX (PTR)(R0), RCON // lvx 4,0,6 Load first 16 bytes into RCON
- VXOR KEY, MASK, KEY // vxor 3,3,5 Adjust for byte swap
LVX (PTR)(R8), MASK // lvx 5,8,6
ADD $0x10, PTR, PTR // addi 6,6,0x10 PTR to next 16 bytes of RCON
- VPERM IN0, IN1, KEY, IN0 // vperm 1,1,2,3 Align
MOVD $8, CNT // li 7,8 CNT = 8
VXOR ZERO, ZERO, ZERO // vxor 0,0,0 Zero to be zero :)
MOVD CNT, CTR // mtctr 7 Set the counter to 8 (rounds)
- LVSL (OUT)(R0), OUTPERM // lvsl 8,0,5
- VSPLTISB $-1, OUTMASK // vspltisb 9,-1
- LVX (OUT)(R0), OUTHEAD // lvx 10,0,5
- VPERM OUTMASK, ZERO, OUTPERM, OUTMASK // vperm 9,9,0,8
-
BLT loop128 // blt .Loop128
- ADD $8, INP, INP // addi 3,3,8
BEQ l192 // beq .L192
- ADD $8, INP, INP // addi 3,3,8
JMP l256 // b .L256
loop128:
// Key schedule (Round 1 to 8)
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(IN0, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5 Write to output
ADD $16, OUT, OUT // addi 5,5,16 Point to the next round
VXOR IN0, TMP, IN0 // vxor 1,1,6
// Key schedule (Round 9)
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-spat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(IN0, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5 Round 9
ADD $16, OUT, OUT // addi 5,5,16
// Key schedule (Round 10)
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(IN0, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5 Round 10
ADD $16, OUT, OUT // addi 5,5,16
// Key schedule (Round 11)
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5 Round 11
+ P8_STXV(IN0, R0, OUT)
- ADD $15, OUT, INP // addi 3,5,15
ADD $0x50, OUT, OUT // addi 5,5,0x50
MOVD $10, ROUNDS // li 8,10
JMP done // b .Ldone
l192:
- LVX (INP)(R0), TMP // lvx 6,0,3
+ LXSDX_BE(INP, R0, IN1) // Load next 8 bytes into upper half of VSR in BE order.
MOVD $4, CNT // li 7,4
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5
+ P8_STXV(IN0, R0, OUT)
ADD $16, OUT, OUT // addi 5,5,16
- VPERM IN1, TMP, KEY, IN1 // vperm 2,2,6,3
VSPLTISB $8, KEY // vspltisb 3,8
MOVD CNT, CTR // mtctr 7
VSUBUBM MASK, KEY, MASK // vsububm 5,5,3
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(STAGE, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5
ADD $16, OUT, OUT // addi 5,5,16
VSLDOI $8, IN0, IN1, STAGE // vsldoi 7,1,2,8
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
- VPERM STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(STAGE, R0, OUT)
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
- STVX STAGE, (OUT+R0) // stvx 7,0,5
ADD $16, OUT, OUT // addi 5,5,16
VSPLTW $3, IN0, TMP // vspltw 6,1,3
VXOR IN1, TMP, IN1 // vxor 2,2,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
VXOR IN1, KEY, IN1 // vxor 2,2,3
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5
- ADD $15, OUT, INP // addi 3,5,15
+ P8_STXV(IN0, R0, OUT)
ADD $16, OUT, OUT // addi 5,5,16
BC 0x10, 0, loop192 // bdnz .Loop192
BR done // b .Ldone
l256:
- LVX (INP)(R0), TMP // lvx 6,0,3
+ P8_LXVB16X(INP, R0, IN1)
MOVD $7, CNT // li 7,7
MOVD $14, ROUNDS // li 8,14
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5
+ P8_STXV(IN0, R0, OUT)
ADD $16, OUT, OUT // addi 5,5,16
- VPERM IN1, TMP, KEY, IN1 // vperm 2,2,6,3
MOVD CNT, CTR // mtctr 7
loop256:
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
- VPERM IN1, IN1, OUTPERM, OUTTAIL // vperm 11,2,2,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
+ P8_STXV(IN1, R0, OUT)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
- STVX STAGE, (OUT+R0) // stvx 7,0,5
ADD $16, OUT, OUT // addi 5,5,16
VXOR IN0, TMP, IN0 // vxor 1,1,6
VXOR IN0, TMP, IN0 // vxor 1,1,6
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN0, KEY, IN0 // vxor 1,1,3
- VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
- VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
- VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
- STVX STAGE, (OUT+R0) // stvx 7,0,5
- ADD $15, OUT, INP // addi 3,5,15
+ P8_STXV(IN0, R0, OUT)
ADD $16, OUT, OUT // addi 5,5,16
BC 0x12, 0, done // bdz .Ldone
JMP loop256 // b .Loop256
done:
- LVX (INP)(R0), IN1 // lvx 2,0,3
- VSEL OUTHEAD, IN1, OUTMASK, IN1 // vsel 2,10,2,9
- STVX IN1, (INP+R0) // stvx 2,0,3
MOVD $0, PTR // li 6,0 set PTR to 0 (exit code 0)
MOVW ROUNDS, 0(OUT) // stw 8,0(5)