]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/aes: enable aes-cbc asm for ppc64
authorPaul E. Murphy <murp@ibm.com>
Mon, 9 May 2022 15:44:07 +0000 (10:44 -0500)
committerPaul Murphy <murp@ibm.com>
Fri, 13 May 2022 14:14:26 +0000 (14:14 +0000)
This reworks how we load/store vector registers using the new
bi-endian P9 instruction emulation macros. This also removes
quite a bit of asm used to align and reorder vector registers.

This is also a slight improvement on P9 ppc64le/linux:

    name             old speed      new speed      delta
    AESCBCEncrypt1K   936MB/s ± 0%   943MB/s ± 0%  +0.80%
    AESCBCDecrypt1K  1.28GB/s ± 0%  1.37GB/s ± 0%  +6.76%

Updates #18499

Change-Id: Ic5ff71d217d7302b6ae4e8d877c25004bfda5ecd
Reviewed-on: https://go-review.googlesource.com/c/go/+/405134
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
src/crypto/aes/asm_ppc64x.s
src/crypto/aes/cbc_ppc64x.go [moved from src/crypto/aes/cbc_ppc64le.go with 98% similarity]

index 3513f09a93fc5666570aa01e5fa644363abbd6db..1ae69d2a2dc44ee0924a8e252c86274dfaadb68f 100644 (file)
@@ -513,10 +513,6 @@ Linvalid_key_len:
 #undef ROUNDS
 #undef KEY
 #undef TMP
-#undef OUTPERM
-#undef OUTMASK
-#undef OUTHEAD
-#undef OUTTAIL
 
 // CBC encrypt or decrypt
 // R3 src
@@ -529,13 +525,9 @@ Linvalid_key_len:
 // Register usage:
 // R9: ROUNDS
 // R10: Index
-// V0: initialized to 0
-// V3: initialized to mask
 // V4: IV
 // V5: SRC
-// V6: IV perm mask
 // V7: DST
-// V10: KEY perm mask
 
 #define INP R3
 #define OUT R4
@@ -547,17 +539,10 @@ Linvalid_key_len:
 #define IDX R10
 
 #define RNDKEY0 V0
-#define RNDKEY1 V1
 #define INOUT V2
 #define TMP V3
 
 #define IVEC V4
-#define INPTAIL V5
-#define INPPERM V6
-#define OUTHEAD V7
-#define OUTPERM V8
-#define OUTMASK V9
-#define KEYPERM V10
 
 // Vector loads are done using LVX followed by
 // a VPERM using mask generated from previous
@@ -588,32 +573,19 @@ TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
        MOVD    enc+40(FP), ENC
        MOVD    nr+48(FP), ROUNDS
 
-       CMPU    LEN, $16                  // cmpldi r5,16
-       BC      14, 0, LR                 // bltlr-
-       CMPW    ENC, $0                   // cmpwi r8,0
-       MOVD    $15, IDX                  // li r10,15
-       VXOR    RNDKEY0, RNDKEY0, RNDKEY0 // vxor v0,v0,v0
-       VSPLTISB        $0xf, TMP         // vspltisb $0xf,v3
-
-       LVX     (IVP)(R0), IVEC                    // lvx v4,r0,r7
-       LVSL    (IVP)(R0), INPPERM                 // lvsl v6,r0,r7
-       LVX     (IVP)(IDX), INPTAIL                // lvx v5,r10,r7
-       VXOR    INPPERM, TMP, INPPERM              // vxor v3, v6, v6
-       VPERM   IVEC, INPTAIL, INPPERM, IVEC       // vperm v4,v4,v5,v6
-       NEG     INP, R11                           // neg r11,r3
-       LVSR    (KEY)(R0), KEYPERM                 // lvsr v10,r0,r6
-       LVSR    (R11)(R0), V6                      // lvsr v6,r0,r11
-       LVX     (INP)(R0), INPTAIL                 // lvx v5,r0,r3
-       ADD     $15, INP                           // addi r3,r3,15
-       VXOR    INPPERM, TMP, INPPERM              // vxor v6, v3, v6
-       LVSL    (OUT)(R0), OUTPERM                 // lvsl v8,r0,r4
-       VSPLTISB        $-1, OUTMASK               // vspltisb v9,-1
-       LVX     (OUT)(R0), OUTHEAD                 // lvx v7,r0,r4
-       VPERM   OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8
-       VXOR    OUTPERM, TMP, OUTPERM              // vxor v8, v3, v8
-       SRW     $1, ROUNDS                         // rlwinm r9,r9,31,1,31
-
-       MOVD    $16, IDX    // li r10,16
+#ifdef GOARCH_ppc64le
+       MOVD    $·rcon(SB), R11
+       LVX     (R11), ESPERM   // Permute value for P8_ macros.
+#endif
+
+       CMPU    LEN, $16    // cmpldi r5,16
+       BC      14, 0, LR   // bltlr-, return if len < 16.
+       CMPW    ENC, $0     // cmpwi r8,0
+
+       P8_LXVB16X(IVP, R0, IVEC) // load ivec in BE register order
+
+       SRW     $1, ROUNDS  // rlwinm r9,r9,31,1,31
+       MOVD    $0, IDX     // li r10,0
        ADD     $-1, ROUNDS // addi r9,r9,-1
        BEQ     Lcbc_dec    // beq
        PCALIGN $16
@@ -621,45 +593,34 @@ TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
        // Outer loop: initialize encrypted value (INOUT)
        // Load input (INPTAIL) ivec (IVEC)
 Lcbc_enc:
-       VOR     INPTAIL, INPTAIL, INOUT            // vor v2,v5,v5
-       LVX     (INP)(R0), INPTAIL                 // lvx v5,r0,r3
+       P8_LXVB16X(INP, R0, INOUT)                 // load text in BE vreg order
        ADD     $16, INP                           // addi r3,r3,16
        MOVD    ROUNDS, CTR                        // mtctr r9
        ADD     $-16, LEN                          // addi r5,r5,-16
-       LVX     (KEY)(R0), RNDKEY0                 // lvx v0,r0,r6
-       VPERM   INOUT, INPTAIL, INPPERM, INOUT     // vperm v2,v2,v5,v6
-       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load first xkey
        ADD     $16, IDX                           // addi r10,r10,16
-       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
        VXOR    INOUT, RNDKEY0, INOUT              // vxor v2,v2,v0
-       LVX     (KEY)(IDX), RNDKEY0                // lvx v0,r10,r6
-       ADD     $16, IDX                           // addi r10,r10,16
        VXOR    INOUT, IVEC, INOUT                 // vxor v2,v2,v4
 
-       // Encryption loop of INOUT using RNDKEY0 and RNDKEY1
+       // Encryption loop of INOUT using RNDKEY0
 Loop_cbc_enc:
-       VPERM   RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10
-       VCIPHER INOUT, RNDKEY1, INOUT              // vcipher v2,v2,v1
-       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load next xkey
+       VCIPHER INOUT, RNDKEY0, INOUT              // vcipher v2,v2,v1
        ADD     $16, IDX                           // addi r10,r10,16
-       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10
-       VCIPHER INOUT, RNDKEY0, INOUT              // vcipher v2,v2,v0
-       LVX     (KEY)(IDX), RNDKEY0                // lvx v0,r10,r6
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load next xkey
+       VCIPHER INOUT, RNDKEY0, INOUT              // vcipher v2,v2,v1
        ADD     $16, IDX                           // addi r10,r10,16
-       BC      16, 0, Loop_cbc_enc                // bdnz Loop_cbc_enc
+       BDNZ Loop_cbc_enc
 
        // Encrypt tail values and store INOUT
-       VPERM   RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10
-       VCIPHER INOUT, RNDKEY1, INOUT              // vcipher v2,v2,v1
-       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
-       MOVD    $16, IDX                           // li r10,16
-       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load next xkey
+       VCIPHER INOUT, RNDKEY0, INOUT              // vcipher v2,v2,v1
+       ADD     $16, IDX                           // addi r10,r10,16
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load final xkey
        VCIPHERLAST     INOUT, RNDKEY0, IVEC       // vcipherlast v4,v2,v0
+       MOVD    $0, IDX                            // reset key index for next block
        CMPU    LEN, $16                           // cmpldi r5,16
-       VPERM   IVEC, IVEC, OUTPERM, TMP           // vperm v3,v4,v4,v8
-       VSEL    OUTHEAD, TMP, OUTMASK, INOUT       // vsel v2,v7,v3,v9
-       VOR     TMP, TMP, OUTHEAD                  // vor v7,v3,v3
-       STVX    INOUT, (OUT)(R0)                   // stvx v2,r0,r4
+       P8_STXVB16X(IVEC, OUT, R0)                 // store ciphertext in BE order
        ADD     $16, OUT                           // addi r4,r4,16
        BGE     Lcbc_enc                           // bge Lcbc_enc
        BR      Lcbc_done                          // b Lcbc_done
@@ -667,69 +628,41 @@ Loop_cbc_enc:
        // Outer loop: initialize decrypted value (INOUT)
        // Load input (INPTAIL) ivec (IVEC)
 Lcbc_dec:
-       VOR     INPTAIL, INPTAIL, TMP              // vor v3,v5,v5
-       LVX     (INP)(R0), INPTAIL                 // lvx v5,r0,r3
+       P8_LXVB16X(INP, R0, TMP)                   // load ciphertext in BE vreg order
        ADD     $16, INP                           // addi r3,r3,16
        MOVD    ROUNDS, CTR                        // mtctr r9
        ADD     $-16, LEN                          // addi r5,r5,-16
-       LVX     (KEY)(R0), RNDKEY0                 // lvx v0,r0,r6
-       VPERM   TMP, INPTAIL, INPPERM, TMP         // vperm v3,v3,v5,v6
-       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load first xkey
        ADD     $16, IDX                           // addi r10,r10,16
-       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
        VXOR    TMP, RNDKEY0, INOUT                // vxor v2,v3,v0
-       LVX     (KEY)(IDX), RNDKEY0                // lvx v0,r10,r6
-       ADD     $16, IDX                           // addi r10,r10,16
        PCALIGN $16
 
-       // Decryption loop of INOUT using RNDKEY0 and RNDKEY1
+       // Decryption loop of INOUT using RNDKEY0
 Loop_cbc_dec:
-       VPERM   RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10
-       VNCIPHER        INOUT, RNDKEY1, INOUT      // vncipher v2,v2,v1
-       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load next xkey
        ADD     $16, IDX                           // addi r10,r10,16
-       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
-       VNCIPHER        INOUT, RNDKEY0, INOUT      // vncipher v2,v2,v0
-       LVX     (KEY)(IDX), RNDKEY0                // lvx v0,r10,r6
+       VNCIPHER        INOUT, RNDKEY0, INOUT      // vncipher v2,v2,v1
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load next xkey
        ADD     $16, IDX                           // addi r10,r10,16
-       BC      16, 0, Loop_cbc_dec                // bdnz
+       VNCIPHER        INOUT, RNDKEY0, INOUT      // vncipher v2,v2,v0
+       BDNZ Loop_cbc_dec
 
        // Decrypt tail values and store INOUT
-       VPERM   RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10
-       VNCIPHER        INOUT, RNDKEY1, INOUT      // vncipher v2,v2,v1
-       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
-       MOVD    $16, IDX                           // li r10,16
-       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load next xkey
+       ADD     $16, IDX                           // addi r10,r10,16
+       VNCIPHER        INOUT, RNDKEY0, INOUT      // vncipher v2,v2,v1
+       P8_LXV(KEY, IDX, RNDKEY0)                  // load final xkey
+       MOVD    $0, IDX                            // li r10,0
        VNCIPHERLAST    INOUT, RNDKEY0, INOUT      // vncipherlast v2,v2,v0
        CMPU    LEN, $16                           // cmpldi r5,16
        VXOR    INOUT, IVEC, INOUT                 // vxor v2,v2,v4
        VOR     TMP, TMP, IVEC                     // vor v4,v3,v3
-       VPERM   INOUT, INOUT, OUTPERM, TMP         // vperm v3,v2,v2,v8
-       VSEL    OUTHEAD, TMP, OUTMASK, INOUT       // vsel v2,v7,v3,v9
-       VOR     TMP, TMP, OUTHEAD                  // vor v7,v3,v3
-       STVX    INOUT, (OUT)(R0)                   // stvx v2,r0,r4
+       P8_STXVB16X(INOUT, OUT, R0)                // store text in BE order
        ADD     $16, OUT                           // addi r4,r4,16
        BGE     Lcbc_dec                           // bge
 
 Lcbc_done:
-       ADD     $-1, OUT                           // addi r4,r4,-1
-       LVX     (OUT)(R0), INOUT                   // lvx v2,r0,r4
-       VSEL    OUTHEAD, INOUT, OUTMASK, INOUT     // vsel v2,v7,v2,v9
-       STVX    INOUT, (OUT)(R0)                   // stvx v2,r0,r4
-       NEG     IVP, ENC                           // neg r8,r7
-       MOVD    $15, IDX                           // li r10,15
-       VXOR    RNDKEY0, RNDKEY0, RNDKEY0          // vxor v0,v0,v0
-       VSPLTISB        $-1, OUTMASK               // vspltisb v9,-1
-       VSPLTISB        $0xf, TMP                  // vspltisb v3, 0xf
-       LVSR    (ENC)(R0), OUTPERM                 // lvsl v8,r0,r8
-       VPERM   OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8
-       VXOR    OUTPERM, TMP, OUTPERM              // vxor v9, v3, v9
-       LVX     (IVP)(R0), OUTHEAD                 // lvx v7,r0,r7
-       VPERM   IVEC, IVEC, OUTPERM, IVEC          // vperm v4,v4,v4,v8
-       VSEL    OUTHEAD, IVEC, OUTMASK, INOUT      // vsel v2,v7,v4,v9
-       LVX     (IVP)(IDX), INPTAIL                // lvx v5,r10,r7
-       STVX    INOUT, (IVP)(R0)                   // stvx v2,r0,r7
-       VSEL    IVEC, INPTAIL, OUTMASK, INOUT      // vsel v2,v4,v5,v9
-       STVX    INOUT, (IVP)(IDX)                  // stvx v2,r10,r7
+       VXOR    RNDKEY0, RNDKEY0, RNDKEY0          // clear key register
+       P8_STXVB16X(IVEC, R0, IVP)                 // Save ivec in BE order for next round.
        RET                                        // bclr 20,lt,0
 
similarity index 98%
rename from src/crypto/aes/cbc_ppc64le.go
rename to src/crypto/aes/cbc_ppc64x.go
index cb9ff4c84397d66a70cc7c02659c7ca028c407a6..4c9257cdd20797ec47cbc68fb10c0aa227625933 100644 (file)
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+//go:build ppc64 || ppc64le
+
 package aes
 
 import (