]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/aes: improve performance for aes-cbc on ppc64le
authorLynn Boger <laboger@linux.vnet.ibm.com>
Thu, 26 Aug 2021 12:09:50 +0000 (07:09 -0500)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Thu, 3 Mar 2022 13:39:12 +0000 (13:39 +0000)
This adds an asm implementation of aes-cbc for ppc64le to
improve performance. This is ported from the
cryptogams implementation as are other functions in
crypto/aes with further description at the top of
the asm file.

Improvements on a power10:

name             old time/op   new time/op    delta
AESCBCEncrypt1K   1.67µs ± 0%    0.87µs ±-48.15%
AESCBCDecrypt1K   1.35µs ± 0%    0.43µs ±-68.48%

name             old speed     new speed      delta
AESCBCEncrypt1K  614MB/s ± 0%  1184MB/s ± 0%+92.84%
AESCBCDecrypt1K  757MB/s ± 0%  2403M/s ± 0 +217.21%

A fuzz test to compare the generic Go implemenation
against the asm implementation has been added.

Change-Id: I18613dfc95c640820b8f1c60d29df638efc7a75c
Reviewed-on: https://go-review.googlesource.com/c/go/+/355429
Trust: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Paul Murphy <murp@ibm.com>
Trust: Paul Murphy <murp@ibm.com>

src/crypto/aes/asm_ppc64le.s
src/crypto/aes/cbc_ppc64le.go [new file with mode: 0644]
src/crypto/cipher/cbc.go
src/crypto/cipher/export_test.go
src/crypto/cipher/fuzz_test.go [new file with mode: 0644]

index f3a96a3a1705bb9c392700a8a59f904f29d1f40f..5eae67532299677075b16fa1902182ebc2bed19b 100644 (file)
@@ -498,3 +498,228 @@ loop_dec:
 
        RET // blr
 
+// Remove defines from above so they can be defined here
+#undef INP
+#undef OUT
+#undef ROUNDS
+#undef KEY
+#undef TMP
+#undef OUTPERM
+#undef OUTMASK
+#undef OUTHEAD
+#undef OUTTAIL
+
+// CBC encrypt or decrypt
+// R3 src
+// R4 dst
+// R5 len
+// R6 key
+// R7 iv
+// R8 enc=1 dec=0
+// Ported from: aes_p8_cbc_encrypt
+// Register usage:
+// R9: ROUNDS
+// R10: Index
+// V0: initialized to 0
+// V3: initialized to mask
+// V4: IV
+// V5: SRC
+// V6: IV perm mask
+// V7: DST
+// V10: KEY perm mask
+
+#define INP R3
+#define OUT R4
+#define LEN R5
+#define KEY R6
+#define IVP R7
+#define ENC R8
+#define ROUNDS R9
+#define IDX R10
+
+#define RNDKEY0 V0
+#define RNDKEY1 V1
+#define INOUT V2
+#define TMP V3
+
+#define IVEC V4
+#define INPTAIL V5
+#define INPPERM V6
+#define OUTHEAD V7
+#define OUTPERM V8
+#define OUTMASK V9
+#define KEYPERM V10
+
+// Vector loads are done using LVX followed by
+// a VPERM using mask generated from previous
+// LVSL or LVSR instruction, to obtain the correct
+// bytes if address is unaligned.
+
+// Encryption is done with VCIPHER and VCIPHERLAST
+// Decryption is done with VNCIPHER and VNCIPHERLAST
+
+// Encrypt and decypt is done as follows:
+// - INOUT value is initialized in outer loop.
+// - ROUNDS value is adjusted for loop unrolling.
+// - Encryption/decryption is done in loop based on
+// adjusted ROUNDS value.
+// - Final INOUT value is encrypted/decrypted and stored.
+
+// Note: original implementation had an 8X version
+// for decryption which was omitted to avoid the
+// complexity.
+
+TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
+       MOVD    src+0(FP), INP
+       MOVD    dst+8(FP), OUT
+       MOVD    length+16(FP), LEN
+       MOVD    key+24(FP), KEY
+       MOVD    iv+32(FP), IVP
+       MOVD    enc+40(FP), ENC
+
+       CMPU    LEN, $16                  // cmpldi r5,16
+       BC      14, 0, LR                 // bltlr-
+       CMPW    ENC, $0                   // cmpwi r8,0
+       MOVD    $15, IDX                  // li r10,15
+       VXOR    RNDKEY0, RNDKEY0, RNDKEY0 // vxor v0,v0,v0
+       VSPLTISB        $0xf, TMP         // vspltisb $0xf,v3
+
+       LVX     (IVP)(R0), IVEC                    // lvx v4,r0,r7
+       LVSL    (IVP)(R0), INPPERM                 // lvsl v6,r0,r7
+       LVX     (IVP)(IDX), INPTAIL                // lvx v5,r10,r7
+       VXOR    INPPERM, TMP, INPPERM              // vxor v3, v6, v6
+       VPERM   IVEC, INPTAIL, INPPERM, IVEC       // vperm v4,v4,v5,v6
+       NEG     INP, R11                           // neg r11,r3
+       LVSR    (KEY)(R0), KEYPERM                 // lvsr v10,r0,r6
+       MOVWZ   240(KEY), ROUNDS                   // lwz r9,240(r6)
+       LVSR    (R11)(R0), V6                      // lvsr v6,r0,r11
+       LVX     (INP)(R0), INPTAIL                 // lvx v5,r0,r3
+       ADD     $15, INP                           // addi r3,r3,15
+       VXOR    INPPERM, TMP, INPPERM              // vxor v6, v3, v6
+       LVSL    (OUT)(R0), OUTPERM                 // lvsl v8,r0,r4
+       VSPLTISB        $-1, OUTMASK               // vspltisb v9,-1
+       LVX     (OUT)(R0), OUTHEAD                 // lvx v7,r0,r4
+       VPERM   OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8
+       VXOR    OUTPERM, TMP, OUTPERM              // vxor v8, v3, v8
+       SRW     $1, ROUNDS                         // rlwinm r9,r9,31,1,31
+
+       MOVD    $16, IDX    // li r10,16
+       ADD     $-1, ROUNDS // addi r9,r9,-1
+       BEQ     Lcbc_dec    // beq
+       PCALIGN $16
+
+       // Outer loop: initialize encrypted value (INOUT)
+       // Load input (INPTAIL) ivec (IVEC)
+Lcbc_enc:
+       VOR     INPTAIL, INPTAIL, INOUT            // vor v2,v5,v5
+       LVX     (INP)(R0), INPTAIL                 // lvx v5,r0,r3
+       ADD     $16, INP                           // addi r3,r3,16
+       MOVD    ROUNDS, CTR                        // mtctr r9
+       ADD     $-16, LEN                          // addi r5,r5,-16
+       LVX     (KEY)(R0), RNDKEY0                 // lvx v0,r0,r6
+       VPERM   INOUT, INPTAIL, INPPERM, INOUT     // vperm v2,v2,v5,v6
+       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       ADD     $16, IDX                           // addi r10,r10,16
+       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
+       VXOR    INOUT, RNDKEY0, INOUT              // vxor v2,v2,v0
+       LVX     (KEY)(IDX), RNDKEY0                // lvx v0,r10,r6
+       ADD     $16, IDX                           // addi r10,r10,16
+       VXOR    INOUT, IVEC, INOUT                 // vxor v2,v2,v4
+
+       // Encryption loop of INOUT using RNDKEY0 and RNDKEY1
+Loop_cbc_enc:
+       VPERM   RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10
+       VCIPHER INOUT, RNDKEY1, INOUT              // vcipher v2,v2,v1
+       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       ADD     $16, IDX                           // addi r10,r10,16
+       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10
+       VCIPHER INOUT, RNDKEY0, INOUT              // vcipher v2,v2,v0
+       LVX     (KEY)(IDX), RNDKEY0                // lvx v0,r10,r6
+       ADD     $16, IDX                           // addi r10,r10,16
+       BC      16, 0, Loop_cbc_enc                // bdnz Loop_cbc_enc
+
+       // Encrypt tail values and store INOUT
+       VPERM   RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10
+       VCIPHER INOUT, RNDKEY1, INOUT              // vcipher v2,v2,v1
+       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       MOVD    $16, IDX                           // li r10,16
+       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10
+       VCIPHERLAST     INOUT, RNDKEY0, IVEC       // vcipherlast v4,v2,v0
+       CMPU    LEN, $16                           // cmpldi r5,16
+       VPERM   IVEC, IVEC, OUTPERM, TMP           // vperm v3,v4,v4,v8
+       VSEL    OUTHEAD, TMP, OUTMASK, INOUT       // vsel v2,v7,v3,v9
+       VOR     TMP, TMP, OUTHEAD                  // vor v7,v3,v3
+       STVX    INOUT, (OUT)(R0)                   // stvx v2,r0,r4
+       ADD     $16, OUT                           // addi r4,r4,16
+       BGE     Lcbc_enc                           // bge Lcbc_enc
+       BR      Lcbc_done                          // b Lcbc_done
+
+       // Outer loop: initialize decrypted value (INOUT)
+       // Load input (INPTAIL) ivec (IVEC)
+Lcbc_dec:
+       VOR     INPTAIL, INPTAIL, TMP              // vor v3,v5,v5
+       LVX     (INP)(R0), INPTAIL                 // lvx v5,r0,r3
+       ADD     $16, INP                           // addi r3,r3,16
+       MOVD    ROUNDS, CTR                        // mtctr r9
+       ADD     $-16, LEN                          // addi r5,r5,-16
+       LVX     (KEY)(R0), RNDKEY0                 // lvx v0,r0,r6
+       VPERM   TMP, INPTAIL, INPPERM, TMP         // vperm v3,v3,v5,v6
+       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       ADD     $16, IDX                           // addi r10,r10,16
+       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
+       VXOR    TMP, RNDKEY0, INOUT                // vxor v2,v3,v0
+       LVX     (KEY)(IDX), RNDKEY0                // lvx v0,r10,r6
+       ADD     $16, IDX                           // addi r10,r10,16
+       PCALIGN $16
+
+       // Decryption loop of INOUT using RNDKEY0 and RNDKEY1
+Loop_cbc_dec:
+       VPERM   RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10
+       VNCIPHER        INOUT, RNDKEY1, INOUT      // vncipher v2,v2,v1
+       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       ADD     $16, IDX                           // addi r10,r10,16
+       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
+       VNCIPHER        INOUT, RNDKEY0, INOUT      // vncipher v2,v2,v0
+       LVX     (KEY)(IDX), RNDKEY0                // lvx v0,r10,r6
+       ADD     $16, IDX                           // addi r10,r10,16
+       BC      16, 0, Loop_cbc_dec                // bdnz
+
+       // Decrypt tail values and store INOUT
+       VPERM   RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10
+       VNCIPHER        INOUT, RNDKEY1, INOUT      // vncipher v2,v2,v1
+       LVX     (KEY)(IDX), RNDKEY1                // lvx v1,r10,r6
+       MOVD    $16, IDX                           // li r10,16
+       VPERM   RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
+       VNCIPHERLAST    INOUT, RNDKEY0, INOUT      // vncipherlast v2,v2,v0
+       CMPU    LEN, $16                           // cmpldi r5,16
+       VXOR    INOUT, IVEC, INOUT                 // vxor v2,v2,v4
+       VOR     TMP, TMP, IVEC                     // vor v4,v3,v3
+       VPERM   INOUT, INOUT, OUTPERM, TMP         // vperm v3,v2,v2,v8
+       VSEL    OUTHEAD, TMP, OUTMASK, INOUT       // vsel v2,v7,v3,v9
+       VOR     TMP, TMP, OUTHEAD                  // vor v7,v3,v3
+       STVX    INOUT, (OUT)(R0)                   // stvx v2,r0,r4
+       ADD     $16, OUT                           // addi r4,r4,16
+       BGE     Lcbc_dec                           // bge
+
+Lcbc_done:
+       ADD     $-1, OUT                           // addi r4,r4,-1
+       LVX     (OUT)(R0), INOUT                   // lvx v2,r0,r4
+       VSEL    OUTHEAD, INOUT, OUTMASK, INOUT     // vsel v2,v7,v2,v9
+       STVX    INOUT, (OUT)(R0)                   // stvx v2,r0,r4
+       NEG     IVP, ENC                           // neg r8,r7
+       MOVD    $15, IDX                           // li r10,15
+       VXOR    RNDKEY0, RNDKEY0, RNDKEY0          // vxor v0,v0,v0
+       VSPLTISB        $-1, OUTMASK               // vspltisb v9,-1
+       VSPLTISB        $0xf, TMP                  // vspltisb v3, 0xf
+       LVSR    (ENC)(R0), OUTPERM                 // lvsl v8,r0,r8
+       VPERM   OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8
+       VXOR    OUTPERM, TMP, OUTPERM              // vxor v9, v3, v9
+       LVX     (IVP)(R0), OUTHEAD                 // lvx v7,r0,r7
+       VPERM   IVEC, IVEC, OUTPERM, IVEC          // vperm v4,v4,v4,v8
+       VSEL    OUTHEAD, IVEC, OUTMASK, INOUT      // vsel v2,v7,v4,v9
+       LVX     (IVP)(IDX), INPTAIL                // lvx v5,r10,r7
+       STVX    INOUT, (IVP)(R0)                   // stvx v2,r0,r7
+       VSEL    IVEC, INPTAIL, OUTMASK, INOUT      // vsel v2,v4,v5,v9
+       STVX    INOUT, (IVP)(IDX)                  // stvx v2,r10,r7
+       RET                                        // bclr 20,lt,0
+
diff --git a/src/crypto/aes/cbc_ppc64le.go b/src/crypto/aes/cbc_ppc64le.go
new file mode 100644 (file)
index 0000000..fa8a430
--- /dev/null
@@ -0,0 +1,71 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package aes
+
+import (
+       "crypto/cipher"
+       "crypto/internal/subtle"
+)
+
+// Assert that aesCipherAsm implements the cbcEncAble and cbcDecAble interfaces.
+var _ cbcEncAble = (*aesCipherAsm)(nil)
+var _ cbcDecAble = (*aesCipherAsm)(nil)
+
+const cbcEncrypt = 1
+const cbcDecrypt = 0
+
+type cbc struct {
+       b   *aesCipherAsm
+       enc int
+       iv  [BlockSize]byte
+}
+
+func (b *aesCipherAsm) NewCBCEncrypter(iv []byte) cipher.BlockMode {
+       var c cbc
+       c.b = b
+       c.enc = cbcEncrypt
+       copy(c.iv[:], iv)
+       return &c
+}
+
+func (b *aesCipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
+       var c cbc
+       c.b = b
+       c.enc = cbcDecrypt
+       copy(c.iv[:], iv)
+       return &c
+}
+
+func (x *cbc) BlockSize() int { return BlockSize }
+
+// cryptBlocksChain invokes the cipher message identifying encrypt or decrypt.
+//go:noescape
+func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int)
+
+func (x *cbc) CryptBlocks(dst, src []byte) {
+       if len(src)%BlockSize != 0 {
+               panic("crypto/cipher: input not full blocks")
+       }
+       if len(dst) < len(src) {
+               panic("crypto/cipher: output smaller than input")
+       }
+       if subtle.InexactOverlap(dst[:len(src)], src) {
+               panic("crypto/cipher: invalid buffer overlap")
+       }
+       if len(src) > 0 {
+               if x.enc == cbcEncrypt {
+                       cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.enc[0], &x.iv[0], x.enc)
+               } else {
+                       cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.dec[0], &x.iv[0], x.enc)
+               }
+       }
+}
+
+func (x *cbc) SetIV(iv []byte) {
+       if len(iv) != BlockSize {
+               panic("cipher: incorrect length IV")
+       }
+       copy(x.iv[:], iv)
+}
index 0d07192e291aa11fae49ebf86d10ba2054387b94..a719b61e24c089e62479b1cf183786650afd9cca 100644 (file)
@@ -52,6 +52,17 @@ func NewCBCEncrypter(b Block, iv []byte) BlockMode {
        return (*cbcEncrypter)(newCBC(b, iv))
 }
 
+// newCBCGenericEncrypter returns a BlockMode which encrypts in cipher block chaining
+// mode, using the given Block. The length of iv must be the same as the
+// Block's block size. This always returns the generic non-asm encrypter for use
+// in fuzz testing.
+func newCBCGenericEncrypter(b Block, iv []byte) BlockMode {
+       if len(iv) != b.BlockSize() {
+               panic("cipher.NewCBCEncrypter: IV length must equal block size")
+       }
+       return (*cbcEncrypter)(newCBC(b, iv))
+}
+
 func (x *cbcEncrypter) BlockSize() int { return x.blockSize }
 
 func (x *cbcEncrypter) CryptBlocks(dst, src []byte) {
@@ -112,6 +123,17 @@ func NewCBCDecrypter(b Block, iv []byte) BlockMode {
        return (*cbcDecrypter)(newCBC(b, iv))
 }
 
+// newCBCGenericDecrypter returns a BlockMode which encrypts in cipher block chaining
+// mode, using the given Block. The length of iv must be the same as the
+// Block's block size. This always returns the generic non-asm decrypter for use in
+// fuzz testing.
+func newCBCGenericDecrypter(b Block, iv []byte) BlockMode {
+       if len(iv) != b.BlockSize() {
+               panic("cipher.NewCBCDecrypter: IV length must equal block size")
+       }
+       return (*cbcDecrypter)(newCBC(b, iv))
+}
+
 func (x *cbcDecrypter) BlockSize() int { return x.blockSize }
 
 func (x *cbcDecrypter) CryptBlocks(dst, src []byte) {
index cf8007ab491c4dcf34fac84927fdcabbccb12012..beb9bf5d2339700678807afc90265e8de61bbed0 100644 (file)
@@ -6,3 +6,5 @@ package cipher
 
 // Export internal functions for testing.
 var XorBytes = xorBytes
+var NewCBCGenericEncrypter = newCBCGenericEncrypter
+var NewCBCGenericDecrypter = newCBCGenericDecrypter
diff --git a/src/crypto/cipher/fuzz_test.go b/src/crypto/cipher/fuzz_test.go
new file mode 100644 (file)
index 0000000..ffceeef
--- /dev/null
@@ -0,0 +1,103 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64le
+
+package cipher_test
+
+import (
+       "bytes"
+       "crypto/aes"
+       "crypto/cipher"
+       "crypto/rand"
+       "testing"
+       "time"
+)
+
+var cbcAESFuzzTests = []struct {
+       name string
+       key  []byte
+}{
+       {
+               "CBC-AES128",
+               commonKey128,
+       },
+       {
+               "CBC-AES192",
+               commonKey192,
+       },
+       {
+               "CBC-AES256",
+               commonKey256,
+       },
+}
+
+var timeout *time.Timer
+
+const datalen = 1024
+
+func TestFuzz(t *testing.T) {
+
+       for _, ft := range cbcAESFuzzTests {
+               c, _ := aes.NewCipher(ft.key)
+
+               cbcAsm := cipher.NewCBCEncrypter(c, commonIV)
+               cbcGeneric := cipher.NewCBCGenericEncrypter(c, commonIV)
+
+               if testing.Short() {
+                       timeout = time.NewTimer(10 * time.Millisecond)
+               } else {
+                       timeout = time.NewTimer(2 * time.Second)
+               }
+
+               indata := make([]byte, datalen)
+               outgeneric := make([]byte, datalen)
+               outdata := make([]byte, datalen)
+
+       fuzzencrypt:
+               for {
+                       select {
+                       case <-timeout.C:
+                               break fuzzencrypt
+                       default:
+                       }
+
+                       rand.Read(indata[:])
+
+                       cbcGeneric.CryptBlocks(indata, outgeneric)
+                       cbcAsm.CryptBlocks(indata, outdata)
+
+                       if !bytes.Equal(outdata, outgeneric) {
+                               t.Fatalf("AES-CBC encryption does not match reference result: %x and %x, please report this error to security@golang.org", outdata, outgeneric)
+                       }
+               }
+
+               cbcAsm = cipher.NewCBCDecrypter(c, commonIV)
+               cbcGeneric = cipher.NewCBCGenericDecrypter(c, commonIV)
+
+               if testing.Short() {
+                       timeout = time.NewTimer(10 * time.Millisecond)
+               } else {
+                       timeout = time.NewTimer(2 * time.Second)
+               }
+
+       fuzzdecrypt:
+               for {
+                       select {
+                       case <-timeout.C:
+                               break fuzzdecrypt
+                       default:
+                       }
+
+                       rand.Read(indata[:])
+
+                       cbcGeneric.CryptBlocks(indata, outgeneric)
+                       cbcAsm.CryptBlocks(indata, outdata)
+
+                       if !bytes.Equal(outdata, outgeneric) {
+                               t.Fatalf("AES-CBC decryption does not match reference result: %x and %x, please report this error to security@golang.org", outdata, outgeneric)
+                       }
+               }
+       }
+}