#define tPtr CX
#define autLen DX
+#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
+#define mulRoundAAD(X ,i) \
+ MOVOU (16*(i*2))(pTbl), T1;\
+ MOVOU T1, T2;\
+ PCLMULQDQ $0x00, X, T1;\
+ PXOR T1, ACC0;\
+ PCLMULQDQ $0x11, X, T2;\
+ PXOR T2, ACC1;\
+ PSHUFD $78, X, T1;\
+ PXOR T1, X;\
+ MOVOU (16*(i*2+1))(pTbl), T1;\
+ PCLMULQDQ $0x00, X, T1;\
+ PXOR T1, ACCM
+
MOVQ productTable+0(FP), pTbl
MOVQ data_base+8(FP), aut
MOVQ data_len+16(FP), autLen
MOVOU bswapMask<>(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
- MOVOU (16*14)(pTbl), T1
- MOVOU (16*15)(pTbl), T2
-
TESTQ autLen, autLen
JEQ dataBail
CMPQ autLen, $13 // optimize the TLS case
- JNE dataSinglesLoop
+ JE dataTLS
+ CMPQ autLen, $128
+ JB startSinglesLoop
+ JMP dataOctaLoop
+dataTLS:
+ MOVOU (16*14)(pTbl), T1
+ MOVOU (16*15)(pTbl), T2
PXOR B0, B0
MOVQ (aut), B0
PINSRD $2, 8(aut), B0
XORQ autLen, autLen
JMP dataMul
+dataOctaLoop:
+ CMPQ autLen, $128
+ JB startSinglesLoop
+ SUBQ $128, autLen
+
+ MOVOU (16*0)(aut), X0
+ MOVOU (16*1)(aut), X1
+ MOVOU (16*2)(aut), X2
+ MOVOU (16*3)(aut), X3
+ MOVOU (16*4)(aut), X4
+ MOVOU (16*5)(aut), X5
+ MOVOU (16*6)(aut), X6
+ MOVOU (16*7)(aut), X7
+ LEAQ (16*8)(aut), aut
+ PSHUFB BSWAP, X0
+ PSHUFB BSWAP, X1
+ PSHUFB BSWAP, X2
+ PSHUFB BSWAP, X3
+ PSHUFB BSWAP, X4
+ PSHUFB BSWAP, X5
+ PSHUFB BSWAP, X6
+ PSHUFB BSWAP, X7
+ PXOR ACC0, X0
+
+ MOVOU (16*0)(pTbl), ACC0
+ MOVOU (16*1)(pTbl), ACCM
+ MOVOU ACC0, ACC1
+ PSHUFD $78, X0, T1
+ PXOR X0, T1
+ PCLMULQDQ $0x00, X0, ACC0
+ PCLMULQDQ $0x11, X0, ACC1
+ PCLMULQDQ $0x00, T1, ACCM
+
+ mulRoundAAD(X1, 1)
+ mulRoundAAD(X2, 2)
+ mulRoundAAD(X3, 3)
+ mulRoundAAD(X4, 4)
+ mulRoundAAD(X5, 5)
+ mulRoundAAD(X6, 6)
+ mulRoundAAD(X7, 7)
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR ACC1, ACC0
+ JMP dataOctaLoop
+
+startSinglesLoop:
+ MOVOU (16*14)(pTbl), T1
+ MOVOU (16*15)(pTbl), T2
+
dataSinglesLoop:
CMPQ autLen, $16
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
-#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
#define combinedRound(i) \
MOVOU (16*i)(ks), T0;\
AESENC T0, B0;\
"testing"
)
+func benchmarkAESGCMSign(b *testing.B, buf []byte) {
+ b.SetBytes(int64(len(buf)))
+
+ var key [16]byte
+ var nonce [12]byte
+ aes, _ := aes.NewCipher(key[:])
+ aesgcm, _ := cipher.NewGCM(aes)
+ var out []byte
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ out = aesgcm.Seal(out[:0], nonce[:], nil, buf)
+ }
+}
+
func benchmarkAESGCMSeal(b *testing.B, buf []byte) {
b.SetBytes(int64(len(buf)))
benchmarkAESGCMOpen(b, make([]byte, 1024))
}
+func BenchmarkAESGCMSign8K(b *testing.B) {
+ benchmarkAESGCMSign(b, make([]byte, 8*1024))
+}
+
func BenchmarkAESGCMSeal8K(b *testing.B) {
benchmarkAESGCMSeal(b, make([]byte, 8*1024))
}