From b2174a16c0012c71a6c6baeb5f7e76868dc411a2 Mon Sep 17 00:00:00 2001 From: Vlad Krasnov Date: Fri, 18 Aug 2017 12:49:59 -0700 Subject: [PATCH] crypto/aes: make the GHASH part of AES-GCM faster By processing 8 blocks in parallel GHASH achieves higher throughput on amd64 Results on Skylake i7: benchmark old ns/op new ns/op delta BenchmarkAESGCMSeal1K-8 316 314 -0.63% BenchmarkAESGCMOpen1K-8 282 281 -0.35% BenchmarkAESGCMSign8K-8 5611 1099 -80.41% BenchmarkAESGCMSeal8K-8 1869 1922 +2.84% BenchmarkAESGCMOpen8K-8 1718 1724 +0.35% benchmark old MB/s new MB/s speedup BenchmarkAESGCMSeal1K-8 3237.10 3260.94 1.01x BenchmarkAESGCMOpen1K-8 3629.74 3638.10 1.00x BenchmarkAESGCMSign8K-8 1459.82 7452.99 5.11x BenchmarkAESGCMSeal8K-8 4382.45 4260.93 0.97x BenchmarkAESGCMOpen8K-8 4766.41 4750.54 1.00x Change-Id: I479f2a791a968caa1c516115b0b6b96a791a20d2 Reviewed-on: https://go-review.googlesource.com/57150 Reviewed-by: Adam Langley --- src/crypto/aes/gcm_amd64.s | 83 +++++++++++++++++++++++++++-- src/crypto/cipher/benchmark_test.go | 19 +++++++ 2 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/crypto/aes/gcm_amd64.s b/src/crypto/aes/gcm_amd64.s index c25badd558..c1fc923a75 100644 --- a/src/crypto/aes/gcm_amd64.s +++ b/src/crypto/aes/gcm_amd64.s @@ -324,6 +324,20 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0 #define tPtr CX #define autLen DX +#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a +#define mulRoundAAD(X ,i) \ + MOVOU (16*(i*2))(pTbl), T1;\ + MOVOU T1, T2;\ + PCLMULQDQ $0x00, X, T1;\ + PXOR T1, ACC0;\ + PCLMULQDQ $0x11, X, T2;\ + PXOR T2, ACC1;\ + PSHUFD $78, X, T1;\ + PXOR T1, X;\ + MOVOU (16*(i*2+1))(pTbl), T1;\ + PCLMULQDQ $0x00, X, T1;\ + PXOR T1, ACCM + MOVQ productTable+0(FP), pTbl MOVQ data_base+8(FP), aut MOVQ data_len+16(FP), autLen @@ -333,15 +347,18 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0 MOVOU bswapMask<>(SB), BSWAP MOVOU gcmPoly<>(SB), POLY - MOVOU (16*14)(pTbl), T1 - MOVOU (16*15)(pTbl), T2 - TESTQ autLen, autLen JEQ dataBail CMPQ autLen, $13 // optimize the TLS case - JNE dataSinglesLoop + JE dataTLS + CMPQ autLen, $128 + JB startSinglesLoop + JMP dataOctaLoop +dataTLS: + MOVOU (16*14)(pTbl), T1 + MOVOU (16*15)(pTbl), T2 PXOR B0, B0 MOVQ (aut), B0 PINSRD $2, 8(aut), B0 @@ -349,6 +366,63 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0 XORQ autLen, autLen JMP dataMul +dataOctaLoop: + CMPQ autLen, $128 + JB startSinglesLoop + SUBQ $128, autLen + + MOVOU (16*0)(aut), X0 + MOVOU (16*1)(aut), X1 + MOVOU (16*2)(aut), X2 + MOVOU (16*3)(aut), X3 + MOVOU (16*4)(aut), X4 + MOVOU (16*5)(aut), X5 + MOVOU (16*6)(aut), X6 + MOVOU (16*7)(aut), X7 + LEAQ (16*8)(aut), aut + PSHUFB BSWAP, X0 + PSHUFB BSWAP, X1 + PSHUFB BSWAP, X2 + PSHUFB BSWAP, X3 + PSHUFB BSWAP, X4 + PSHUFB BSWAP, X5 + PSHUFB BSWAP, X6 + PSHUFB BSWAP, X7 + PXOR ACC0, X0 + + MOVOU (16*0)(pTbl), ACC0 + MOVOU (16*1)(pTbl), ACCM + MOVOU ACC0, ACC1 + PSHUFD $78, X0, T1 + PXOR X0, T1 + PCLMULQDQ $0x00, X0, ACC0 + PCLMULQDQ $0x11, X0, ACC1 + PCLMULQDQ $0x00, T1, ACCM + + mulRoundAAD(X1, 1) + mulRoundAAD(X2, 2) + mulRoundAAD(X3, 3) + mulRoundAAD(X4, 4) + mulRoundAAD(X5, 5) + mulRoundAAD(X6, 6) + mulRoundAAD(X7, 7) + + PXOR ACC0, ACCM + PXOR ACC1, ACCM + MOVOU ACCM, T0 + PSRLDQ $8, ACCM + PSLLDQ $8, T0 + PXOR ACCM, ACC1 + PXOR T0, ACC0 + reduceRound(ACC0) + reduceRound(ACC0) + PXOR ACC1, ACC0 + JMP dataOctaLoop + +startSinglesLoop: + MOVOU (16*14)(pTbl), T1 + MOVOU (16*15)(pTbl), T2 + dataSinglesLoop: CMPQ autLen, $16 @@ -438,7 +512,6 @@ TEXT ·gcmAesEnc(SB),0,$256-96 #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7 #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7 #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7 -#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a #define combinedRound(i) \ MOVOU (16*i)(ks), T0;\ AESENC T0, B0;\ diff --git a/src/crypto/cipher/benchmark_test.go b/src/crypto/cipher/benchmark_test.go index 93c40d0f46..1a3f1bdfac 100644 --- a/src/crypto/cipher/benchmark_test.go +++ b/src/crypto/cipher/benchmark_test.go @@ -10,6 +10,21 @@ import ( "testing" ) +func benchmarkAESGCMSign(b *testing.B, buf []byte) { + b.SetBytes(int64(len(buf))) + + var key [16]byte + var nonce [12]byte + aes, _ := aes.NewCipher(key[:]) + aesgcm, _ := cipher.NewGCM(aes) + var out []byte + + b.ResetTimer() + for i := 0; i < b.N; i++ { + out = aesgcm.Seal(out[:0], nonce[:], nil, buf) + } +} + func benchmarkAESGCMSeal(b *testing.B, buf []byte) { b.SetBytes(int64(len(buf))) @@ -54,6 +69,10 @@ func BenchmarkAESGCMOpen1K(b *testing.B) { benchmarkAESGCMOpen(b, make([]byte, 1024)) } +func BenchmarkAESGCMSign8K(b *testing.B) { + benchmarkAESGCMSign(b, make([]byte, 8*1024)) +} + func BenchmarkAESGCMSeal8K(b *testing.B) { benchmarkAESGCMSeal(b, make([]byte, 8*1024)) } -- 2.48.1