From: Lynn Boger Date: Thu, 26 Aug 2021 12:09:50 +0000 (-0500) Subject: crypto/aes: improve performance for aes-cbc on ppc64le X-Git-Tag: go1.19beta1~1182 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=2c6700739a6275b78a6af62707fd41783622061b;p=gostls13.git crypto/aes: improve performance for aes-cbc on ppc64le This adds an asm implementation of aes-cbc for ppc64le to improve performance. This is ported from the cryptogams implementation as are other functions in crypto/aes with further description at the top of the asm file. Improvements on a power10: name old time/op new time/op delta AESCBCEncrypt1K 1.67µs ± 0% 0.87µs ±-48.15% AESCBCDecrypt1K 1.35µs ± 0% 0.43µs ±-68.48% name old speed new speed delta AESCBCEncrypt1K 614MB/s ± 0% 1184MB/s ± 0%+92.84% AESCBCDecrypt1K 757MB/s ± 0% 2403M/s ± 0 +217.21% A fuzz test to compare the generic Go implemenation against the asm implementation has been added. Change-Id: I18613dfc95c640820b8f1c60d29df638efc7a75c Reviewed-on: https://go-review.googlesource.com/c/go/+/355429 Trust: Lynn Boger Run-TryBot: Lynn Boger TryBot-Result: Gopher Robot Reviewed-by: Paul Murphy Trust: Paul Murphy --- diff --git a/src/crypto/aes/asm_ppc64le.s b/src/crypto/aes/asm_ppc64le.s index f3a96a3a17..5eae675322 100644 --- a/src/crypto/aes/asm_ppc64le.s +++ b/src/crypto/aes/asm_ppc64le.s @@ -498,3 +498,228 @@ loop_dec: RET // blr +// Remove defines from above so they can be defined here +#undef INP +#undef OUT +#undef ROUNDS +#undef KEY +#undef TMP +#undef OUTPERM +#undef OUTMASK +#undef OUTHEAD +#undef OUTTAIL + +// CBC encrypt or decrypt +// R3 src +// R4 dst +// R5 len +// R6 key +// R7 iv +// R8 enc=1 dec=0 +// Ported from: aes_p8_cbc_encrypt +// Register usage: +// R9: ROUNDS +// R10: Index +// V0: initialized to 0 +// V3: initialized to mask +// V4: IV +// V5: SRC +// V6: IV perm mask +// V7: DST +// V10: KEY perm mask + +#define INP R3 +#define OUT R4 +#define LEN R5 +#define KEY R6 +#define IVP R7 +#define ENC R8 +#define ROUNDS R9 +#define IDX R10 + +#define RNDKEY0 V0 +#define RNDKEY1 V1 +#define INOUT V2 +#define TMP V3 + +#define IVEC V4 +#define INPTAIL V5 +#define INPPERM V6 +#define OUTHEAD V7 +#define OUTPERM V8 +#define OUTMASK V9 +#define KEYPERM V10 + +// Vector loads are done using LVX followed by +// a VPERM using mask generated from previous +// LVSL or LVSR instruction, to obtain the correct +// bytes if address is unaligned. + +// Encryption is done with VCIPHER and VCIPHERLAST +// Decryption is done with VNCIPHER and VNCIPHERLAST + +// Encrypt and decypt is done as follows: +// - INOUT value is initialized in outer loop. +// - ROUNDS value is adjusted for loop unrolling. +// - Encryption/decryption is done in loop based on +// adjusted ROUNDS value. +// - Final INOUT value is encrypted/decrypted and stored. + +// Note: original implementation had an 8X version +// for decryption which was omitted to avoid the +// complexity. + +TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0 + MOVD src+0(FP), INP + MOVD dst+8(FP), OUT + MOVD length+16(FP), LEN + MOVD key+24(FP), KEY + MOVD iv+32(FP), IVP + MOVD enc+40(FP), ENC + + CMPU LEN, $16 // cmpldi r5,16 + BC 14, 0, LR // bltlr- + CMPW ENC, $0 // cmpwi r8,0 + MOVD $15, IDX // li r10,15 + VXOR RNDKEY0, RNDKEY0, RNDKEY0 // vxor v0,v0,v0 + VSPLTISB $0xf, TMP // vspltisb $0xf,v3 + + LVX (IVP)(R0), IVEC // lvx v4,r0,r7 + LVSL (IVP)(R0), INPPERM // lvsl v6,r0,r7 + LVX (IVP)(IDX), INPTAIL // lvx v5,r10,r7 + VXOR INPPERM, TMP, INPPERM // vxor v3, v6, v6 + VPERM IVEC, INPTAIL, INPPERM, IVEC // vperm v4,v4,v5,v6 + NEG INP, R11 // neg r11,r3 + LVSR (KEY)(R0), KEYPERM // lvsr v10,r0,r6 + MOVWZ 240(KEY), ROUNDS // lwz r9,240(r6) + LVSR (R11)(R0), V6 // lvsr v6,r0,r11 + LVX (INP)(R0), INPTAIL // lvx v5,r0,r3 + ADD $15, INP // addi r3,r3,15 + VXOR INPPERM, TMP, INPPERM // vxor v6, v3, v6 + LVSL (OUT)(R0), OUTPERM // lvsl v8,r0,r4 + VSPLTISB $-1, OUTMASK // vspltisb v9,-1 + LVX (OUT)(R0), OUTHEAD // lvx v7,r0,r4 + VPERM OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8 + VXOR OUTPERM, TMP, OUTPERM // vxor v8, v3, v8 + SRW $1, ROUNDS // rlwinm r9,r9,31,1,31 + + MOVD $16, IDX // li r10,16 + ADD $-1, ROUNDS // addi r9,r9,-1 + BEQ Lcbc_dec // beq + PCALIGN $16 + + // Outer loop: initialize encrypted value (INOUT) + // Load input (INPTAIL) ivec (IVEC) +Lcbc_enc: + VOR INPTAIL, INPTAIL, INOUT // vor v2,v5,v5 + LVX (INP)(R0), INPTAIL // lvx v5,r0,r3 + ADD $16, INP // addi r3,r3,16 + MOVD ROUNDS, CTR // mtctr r9 + ADD $-16, LEN // addi r5,r5,-16 + LVX (KEY)(R0), RNDKEY0 // lvx v0,r0,r6 + VPERM INOUT, INPTAIL, INPPERM, INOUT // vperm v2,v2,v5,v6 + LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + ADD $16, IDX // addi r10,r10,16 + VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10 + VXOR INOUT, RNDKEY0, INOUT // vxor v2,v2,v0 + LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6 + ADD $16, IDX // addi r10,r10,16 + VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4 + + // Encryption loop of INOUT using RNDKEY0 and RNDKEY1 +Loop_cbc_enc: + VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10 + VCIPHER INOUT, RNDKEY1, INOUT // vcipher v2,v2,v1 + LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + ADD $16, IDX // addi r10,r10,16 + VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10 + VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v0 + LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6 + ADD $16, IDX // addi r10,r10,16 + BC 16, 0, Loop_cbc_enc // bdnz Loop_cbc_enc + + // Encrypt tail values and store INOUT + VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10 + VCIPHER INOUT, RNDKEY1, INOUT // vcipher v2,v2,v1 + LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + MOVD $16, IDX // li r10,16 + VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10 + VCIPHERLAST INOUT, RNDKEY0, IVEC // vcipherlast v4,v2,v0 + CMPU LEN, $16 // cmpldi r5,16 + VPERM IVEC, IVEC, OUTPERM, TMP // vperm v3,v4,v4,v8 + VSEL OUTHEAD, TMP, OUTMASK, INOUT // vsel v2,v7,v3,v9 + VOR TMP, TMP, OUTHEAD // vor v7,v3,v3 + STVX INOUT, (OUT)(R0) // stvx v2,r0,r4 + ADD $16, OUT // addi r4,r4,16 + BGE Lcbc_enc // bge Lcbc_enc + BR Lcbc_done // b Lcbc_done + + // Outer loop: initialize decrypted value (INOUT) + // Load input (INPTAIL) ivec (IVEC) +Lcbc_dec: + VOR INPTAIL, INPTAIL, TMP // vor v3,v5,v5 + LVX (INP)(R0), INPTAIL // lvx v5,r0,r3 + ADD $16, INP // addi r3,r3,16 + MOVD ROUNDS, CTR // mtctr r9 + ADD $-16, LEN // addi r5,r5,-16 + LVX (KEY)(R0), RNDKEY0 // lvx v0,r0,r6 + VPERM TMP, INPTAIL, INPPERM, TMP // vperm v3,v3,v5,v6 + LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + ADD $16, IDX // addi r10,r10,16 + VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10 + VXOR TMP, RNDKEY0, INOUT // vxor v2,v3,v0 + LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6 + ADD $16, IDX // addi r10,r10,16 + PCALIGN $16 + + // Decryption loop of INOUT using RNDKEY0 and RNDKEY1 +Loop_cbc_dec: + VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10 + VNCIPHER INOUT, RNDKEY1, INOUT // vncipher v2,v2,v1 + LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + ADD $16, IDX // addi r10,r10,16 + VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10 + VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v0 + LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6 + ADD $16, IDX // addi r10,r10,16 + BC 16, 0, Loop_cbc_dec // bdnz + + // Decrypt tail values and store INOUT + VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10 + VNCIPHER INOUT, RNDKEY1, INOUT // vncipher v2,v2,v1 + LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6 + MOVD $16, IDX // li r10,16 + VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10 + VNCIPHERLAST INOUT, RNDKEY0, INOUT // vncipherlast v2,v2,v0 + CMPU LEN, $16 // cmpldi r5,16 + VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4 + VOR TMP, TMP, IVEC // vor v4,v3,v3 + VPERM INOUT, INOUT, OUTPERM, TMP // vperm v3,v2,v2,v8 + VSEL OUTHEAD, TMP, OUTMASK, INOUT // vsel v2,v7,v3,v9 + VOR TMP, TMP, OUTHEAD // vor v7,v3,v3 + STVX INOUT, (OUT)(R0) // stvx v2,r0,r4 + ADD $16, OUT // addi r4,r4,16 + BGE Lcbc_dec // bge + +Lcbc_done: + ADD $-1, OUT // addi r4,r4,-1 + LVX (OUT)(R0), INOUT // lvx v2,r0,r4 + VSEL OUTHEAD, INOUT, OUTMASK, INOUT // vsel v2,v7,v2,v9 + STVX INOUT, (OUT)(R0) // stvx v2,r0,r4 + NEG IVP, ENC // neg r8,r7 + MOVD $15, IDX // li r10,15 + VXOR RNDKEY0, RNDKEY0, RNDKEY0 // vxor v0,v0,v0 + VSPLTISB $-1, OUTMASK // vspltisb v9,-1 + VSPLTISB $0xf, TMP // vspltisb v3, 0xf + LVSR (ENC)(R0), OUTPERM // lvsl v8,r0,r8 + VPERM OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8 + VXOR OUTPERM, TMP, OUTPERM // vxor v9, v3, v9 + LVX (IVP)(R0), OUTHEAD // lvx v7,r0,r7 + VPERM IVEC, IVEC, OUTPERM, IVEC // vperm v4,v4,v4,v8 + VSEL OUTHEAD, IVEC, OUTMASK, INOUT // vsel v2,v7,v4,v9 + LVX (IVP)(IDX), INPTAIL // lvx v5,r10,r7 + STVX INOUT, (IVP)(R0) // stvx v2,r0,r7 + VSEL IVEC, INPTAIL, OUTMASK, INOUT // vsel v2,v4,v5,v9 + STVX INOUT, (IVP)(IDX) // stvx v2,r10,r7 + RET // bclr 20,lt,0 + diff --git a/src/crypto/aes/cbc_ppc64le.go b/src/crypto/aes/cbc_ppc64le.go new file mode 100644 index 0000000000..fa8a430ed4 --- /dev/null +++ b/src/crypto/aes/cbc_ppc64le.go @@ -0,0 +1,71 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package aes + +import ( + "crypto/cipher" + "crypto/internal/subtle" +) + +// Assert that aesCipherAsm implements the cbcEncAble and cbcDecAble interfaces. +var _ cbcEncAble = (*aesCipherAsm)(nil) +var _ cbcDecAble = (*aesCipherAsm)(nil) + +const cbcEncrypt = 1 +const cbcDecrypt = 0 + +type cbc struct { + b *aesCipherAsm + enc int + iv [BlockSize]byte +} + +func (b *aesCipherAsm) NewCBCEncrypter(iv []byte) cipher.BlockMode { + var c cbc + c.b = b + c.enc = cbcEncrypt + copy(c.iv[:], iv) + return &c +} + +func (b *aesCipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode { + var c cbc + c.b = b + c.enc = cbcDecrypt + copy(c.iv[:], iv) + return &c +} + +func (x *cbc) BlockSize() int { return BlockSize } + +// cryptBlocksChain invokes the cipher message identifying encrypt or decrypt. +//go:noescape +func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int) + +func (x *cbc) CryptBlocks(dst, src []byte) { + if len(src)%BlockSize != 0 { + panic("crypto/cipher: input not full blocks") + } + if len(dst) < len(src) { + panic("crypto/cipher: output smaller than input") + } + if subtle.InexactOverlap(dst[:len(src)], src) { + panic("crypto/cipher: invalid buffer overlap") + } + if len(src) > 0 { + if x.enc == cbcEncrypt { + cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.enc[0], &x.iv[0], x.enc) + } else { + cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.dec[0], &x.iv[0], x.enc) + } + } +} + +func (x *cbc) SetIV(iv []byte) { + if len(iv) != BlockSize { + panic("cipher: incorrect length IV") + } + copy(x.iv[:], iv) +} diff --git a/src/crypto/cipher/cbc.go b/src/crypto/cipher/cbc.go index 0d07192e29..a719b61e24 100644 --- a/src/crypto/cipher/cbc.go +++ b/src/crypto/cipher/cbc.go @@ -52,6 +52,17 @@ func NewCBCEncrypter(b Block, iv []byte) BlockMode { return (*cbcEncrypter)(newCBC(b, iv)) } +// newCBCGenericEncrypter returns a BlockMode which encrypts in cipher block chaining +// mode, using the given Block. The length of iv must be the same as the +// Block's block size. This always returns the generic non-asm encrypter for use +// in fuzz testing. +func newCBCGenericEncrypter(b Block, iv []byte) BlockMode { + if len(iv) != b.BlockSize() { + panic("cipher.NewCBCEncrypter: IV length must equal block size") + } + return (*cbcEncrypter)(newCBC(b, iv)) +} + func (x *cbcEncrypter) BlockSize() int { return x.blockSize } func (x *cbcEncrypter) CryptBlocks(dst, src []byte) { @@ -112,6 +123,17 @@ func NewCBCDecrypter(b Block, iv []byte) BlockMode { return (*cbcDecrypter)(newCBC(b, iv)) } +// newCBCGenericDecrypter returns a BlockMode which encrypts in cipher block chaining +// mode, using the given Block. The length of iv must be the same as the +// Block's block size. This always returns the generic non-asm decrypter for use in +// fuzz testing. +func newCBCGenericDecrypter(b Block, iv []byte) BlockMode { + if len(iv) != b.BlockSize() { + panic("cipher.NewCBCDecrypter: IV length must equal block size") + } + return (*cbcDecrypter)(newCBC(b, iv)) +} + func (x *cbcDecrypter) BlockSize() int { return x.blockSize } func (x *cbcDecrypter) CryptBlocks(dst, src []byte) { diff --git a/src/crypto/cipher/export_test.go b/src/crypto/cipher/export_test.go index cf8007ab49..beb9bf5d23 100644 --- a/src/crypto/cipher/export_test.go +++ b/src/crypto/cipher/export_test.go @@ -6,3 +6,5 @@ package cipher // Export internal functions for testing. var XorBytes = xorBytes +var NewCBCGenericEncrypter = newCBCGenericEncrypter +var NewCBCGenericDecrypter = newCBCGenericDecrypter diff --git a/src/crypto/cipher/fuzz_test.go b/src/crypto/cipher/fuzz_test.go new file mode 100644 index 0000000000..ffceeef5f5 --- /dev/null +++ b/src/crypto/cipher/fuzz_test.go @@ -0,0 +1,103 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ppc64le + +package cipher_test + +import ( + "bytes" + "crypto/aes" + "crypto/cipher" + "crypto/rand" + "testing" + "time" +) + +var cbcAESFuzzTests = []struct { + name string + key []byte +}{ + { + "CBC-AES128", + commonKey128, + }, + { + "CBC-AES192", + commonKey192, + }, + { + "CBC-AES256", + commonKey256, + }, +} + +var timeout *time.Timer + +const datalen = 1024 + +func TestFuzz(t *testing.T) { + + for _, ft := range cbcAESFuzzTests { + c, _ := aes.NewCipher(ft.key) + + cbcAsm := cipher.NewCBCEncrypter(c, commonIV) + cbcGeneric := cipher.NewCBCGenericEncrypter(c, commonIV) + + if testing.Short() { + timeout = time.NewTimer(10 * time.Millisecond) + } else { + timeout = time.NewTimer(2 * time.Second) + } + + indata := make([]byte, datalen) + outgeneric := make([]byte, datalen) + outdata := make([]byte, datalen) + + fuzzencrypt: + for { + select { + case <-timeout.C: + break fuzzencrypt + default: + } + + rand.Read(indata[:]) + + cbcGeneric.CryptBlocks(indata, outgeneric) + cbcAsm.CryptBlocks(indata, outdata) + + if !bytes.Equal(outdata, outgeneric) { + t.Fatalf("AES-CBC encryption does not match reference result: %x and %x, please report this error to security@golang.org", outdata, outgeneric) + } + } + + cbcAsm = cipher.NewCBCDecrypter(c, commonIV) + cbcGeneric = cipher.NewCBCGenericDecrypter(c, commonIV) + + if testing.Short() { + timeout = time.NewTimer(10 * time.Millisecond) + } else { + timeout = time.NewTimer(2 * time.Second) + } + + fuzzdecrypt: + for { + select { + case <-timeout.C: + break fuzzdecrypt + default: + } + + rand.Read(indata[:]) + + cbcGeneric.CryptBlocks(indata, outgeneric) + cbcAsm.CryptBlocks(indata, outdata) + + if !bytes.Equal(outdata, outgeneric) { + t.Fatalf("AES-CBC decryption does not match reference result: %x and %x, please report this error to security@golang.org", outdata, outgeneric) + } + } + } +}