Cypherpunks repositories - gostls13.git/commitdiff
crypto/aes: make the GHASH part of AES-GCM faster
author Vlad Krasnov <vlad@cloudflare.com>
Fri, 18 Aug 2017 19:49:59 +0000 (12:49 -0700)
committer Adam Langley <agl@golang.org>
Fri, 18 Aug 2017 21:40:57 +0000 (21:40 +0000)
By processing 8 blocks in parallel, GHASH achieves higher throughput on amd64.

Results on Skylake i7:

benchmark                   old ns/op     new ns/op     delta
BenchmarkAESGCMSeal1K-8     316           314           -0.63%
BenchmarkAESGCMOpen1K-8     282           281           -0.35%
BenchmarkAESGCMSign8K-8     5611          1099          -80.41%
BenchmarkAESGCMSeal8K-8     1869          1922          +2.84%
BenchmarkAESGCMOpen8K-8     1718          1724          +0.35%

benchmark                   old MB/s     new MB/s     speedup
BenchmarkAESGCMSeal1K-8     3237.10      3260.94      1.01x
BenchmarkAESGCMOpen1K-8     3629.74      3638.10      1.00x
BenchmarkAESGCMSign8K-8     1459.82      7452.99      5.11x
BenchmarkAESGCMSeal8K-8     4382.45      4260.93      0.97x
BenchmarkAESGCMOpen8K-8     4766.41      4750.54      1.00x

Change-Id: I479f2a791a968caa1c516115b0b6b96a791a20d2
Reviewed-on: https://go-review.googlesource.com/57150
Reviewed-by: Adam Langley <agl@golang.org>
src/crypto/aes/gcm_amd64.s
src/crypto/cipher/benchmark_test.go

index c25badd55837c45329d33af901bcc46263aaec5f..c1fc923a759e17f588b411d43c5353d44c0d533f 100644 (file)
@@ -324,6 +324,20 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0
 #define tPtr CX
 #define autLen DX
 
+#define reduceRound(a)         MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
+#define mulRoundAAD(X ,i) \
+       MOVOU (16*(i*2))(pTbl), T1;\
+       MOVOU T1, T2;\
+       PCLMULQDQ $0x00, X, T1;\
+       PXOR T1, ACC0;\
+       PCLMULQDQ $0x11, X, T2;\
+       PXOR T2, ACC1;\
+       PSHUFD $78, X, T1;\
+       PXOR T1, X;\
+       MOVOU (16*(i*2+1))(pTbl), T1;\
+       PCLMULQDQ $0x00, X, T1;\
+       PXOR T1, ACCM
+
        MOVQ productTable+0(FP), pTbl
        MOVQ data_base+8(FP), aut
        MOVQ data_len+16(FP), autLen
@@ -333,15 +347,18 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0
        MOVOU bswapMask<>(SB), BSWAP
        MOVOU gcmPoly<>(SB), POLY
 
-       MOVOU (16*14)(pTbl), T1
-       MOVOU (16*15)(pTbl), T2
-
        TESTQ autLen, autLen
        JEQ dataBail
 
        CMPQ autLen, $13        // optimize the TLS case
-       JNE dataSinglesLoop
+       JE dataTLS
+       CMPQ autLen, $128
+       JB startSinglesLoop
+       JMP dataOctaLoop
 
+dataTLS:
+       MOVOU (16*14)(pTbl), T1
+       MOVOU (16*15)(pTbl), T2
        PXOR B0, B0
        MOVQ (aut), B0
        PINSRD $2, 8(aut), B0
@@ -349,6 +366,63 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0
        XORQ autLen, autLen
        JMP dataMul
 
+dataOctaLoop:
+               CMPQ autLen, $128
+               JB startSinglesLoop
+               SUBQ $128, autLen
+
+               MOVOU (16*0)(aut), X0
+               MOVOU (16*1)(aut), X1
+               MOVOU (16*2)(aut), X2
+               MOVOU (16*3)(aut), X3
+               MOVOU (16*4)(aut), X4
+               MOVOU (16*5)(aut), X5
+               MOVOU (16*6)(aut), X6
+               MOVOU (16*7)(aut), X7
+               LEAQ (16*8)(aut), aut
+               PSHUFB BSWAP, X0
+               PSHUFB BSWAP, X1
+               PSHUFB BSWAP, X2
+               PSHUFB BSWAP, X3
+               PSHUFB BSWAP, X4
+               PSHUFB BSWAP, X5
+               PSHUFB BSWAP, X6
+               PSHUFB BSWAP, X7
+               PXOR ACC0, X0
+
+               MOVOU (16*0)(pTbl), ACC0
+               MOVOU (16*1)(pTbl), ACCM
+               MOVOU ACC0, ACC1
+               PSHUFD $78, X0, T1
+               PXOR X0, T1
+               PCLMULQDQ $0x00, X0, ACC0
+               PCLMULQDQ $0x11, X0, ACC1
+               PCLMULQDQ $0x00, T1, ACCM
+
+               mulRoundAAD(X1, 1)
+               mulRoundAAD(X2, 2)
+               mulRoundAAD(X3, 3)
+               mulRoundAAD(X4, 4)
+               mulRoundAAD(X5, 5)
+               mulRoundAAD(X6, 6)
+               mulRoundAAD(X7, 7)
+
+               PXOR ACC0, ACCM
+               PXOR ACC1, ACCM
+               MOVOU ACCM, T0
+               PSRLDQ $8, ACCM
+               PSLLDQ $8, T0
+               PXOR ACCM, ACC1
+               PXOR T0, ACC0
+               reduceRound(ACC0)
+               reduceRound(ACC0)
+               PXOR ACC1, ACC0
+       JMP dataOctaLoop
+
+startSinglesLoop:
+       MOVOU (16*14)(pTbl), T1
+       MOVOU (16*15)(pTbl), T2
+
 dataSinglesLoop:
 
                CMPQ autLen, $16
@@ -438,7 +512,6 @@ TEXT ·gcmAesEnc(SB),0,$256-96
 #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
 #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
 #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
-#define reduceRound(a)         MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
 #define combinedRound(i) \
        MOVOU (16*i)(ks), T0;\
        AESENC T0, B0;\
index 93c40d0f4665fe50688fcb8dc07b7476a794da71..1a3f1bdfacabef9be8331ba68ed5c376cb4638d1 100644 (file)
@@ -10,6 +10,21 @@ import (
        "testing"
 )
 
+func benchmarkAESGCMSign(b *testing.B, buf []byte) {
+       b.SetBytes(int64(len(buf)))
+
+       var key [16]byte
+       var nonce [12]byte
+       aes, _ := aes.NewCipher(key[:])
+       aesgcm, _ := cipher.NewGCM(aes)
+       var out []byte
+
+       b.ResetTimer()
+       for i := 0; i < b.N; i++ {
+               out = aesgcm.Seal(out[:0], nonce[:], nil, buf)
+       }
+}
+
 func benchmarkAESGCMSeal(b *testing.B, buf []byte) {
        b.SetBytes(int64(len(buf)))
 
@@ -54,6 +69,10 @@ func BenchmarkAESGCMOpen1K(b *testing.B) {
        benchmarkAESGCMOpen(b, make([]byte, 1024))
 }
 
+func BenchmarkAESGCMSign8K(b *testing.B) {
+       benchmarkAESGCMSign(b, make([]byte, 8*1024))
+}
+
 func BenchmarkAESGCMSeal8K(b *testing.B) {
        benchmarkAESGCMSeal(b, make([]byte, 8*1024))
 }