From: Garrett Bodley
Date: Sat, 27 Jul 2024 03:17:02 +0000 (-0400)
Subject: crypto/aes: Avo port of gcm_amd64.s
X-Git-Tag: go1.24rc1~1008
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6ee8c07c3cb8b20e46cd168a8bf6ebaf8243b2ac;p=gostls13.git

crypto/aes: Avo port of gcm_amd64.s

This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, capturing the debug output
into corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file
references) that must be removed in order to obtain a semantic diff of
the two files. This is accomplished via a small utility script written
in awk.

The reference assembly file does not specify a frame size for some of
the defined assembly functions. Avo automatically infers the frame size
when generating TEXT directives, leading to a diff on those lines.

Avo also emits fully qualified parameter names that the reference
assembly does not use (dst_base+8(FP) rather than dst+8(FP)), leading
to a diff on the lines where that parameter is referenced.

Commands used to verify Avo output:

GOROOT=$(go env GOROOT)
ASM_PATH="src/crypto/aes/gcm_amd64.s"
REFERENCE="54fe0fd43fcf8609666c16ae6d15ed92873b1564"

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  <(git cat-file -p "$REFERENCE:$ASM_PATH") \
  > /tmp/reference.s

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  "$ASM_PATH" \
  > /tmp/avo.s

normalize(){
  awk '{ $1=$2=$3=""; print substr($0,4) }'
}

diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

1c1
< TEXT .gcmAesFinish(SB), NOSPLIT, $0
---
> TEXT .gcmAesFinish(SB), NOSPLIT, $0-40
44c44
< TEXT .gcmAesInit(SB), NOSPLIT, $0
---
> TEXT .gcmAesInit(SB), NOSPLIT, $0-32
131c131
< TEXT .gcmAesData(SB), NOSPLIT, $0
---
> TEXT .gcmAesData(SB), NOSPLIT, $0-40
325c325
< MOVQ dst+8(FP), DX
---
> MOVQ dst_base+8(FP), DX
1207c1207
< MOVQ dst+8(FP), SI
---
> MOVQ dst_base+8(FP), SI

Change-Id: Iad8f8c6ea5d50ac093c8535adc9d23fbf2612fc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/601462
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Roland Shoemaker
Reviewed-by: Filippo Valsorda
Reviewed-by: Dmitri Shuralyov
---

diff --git a/src/cmd/compile/internal/types2/stdlib_test.go b/src/cmd/compile/internal/types2/stdlib_test.go
index 00f9fab613..7a89556bb3 100644
--- a/src/cmd/compile/internal/types2/stdlib_test.go
+++ b/src/cmd/compile/internal/types2/stdlib_test.go
@@ -355,6 +355,7 @@ var excluded = map[string]bool{
 	"builtin": true,
 
 	// go.dev/issue/46027: some imports are missing for this submodule.
+	"crypto/aes/_asm/gcm":                      true,
 	"crypto/internal/bigmod/_asm":              true,
 	"crypto/internal/edwards25519/field/_asm":  true,
 	"crypto/md5/_asm":                          true,
diff --git a/src/crypto/aes/_asm/gcm/gcm_amd64_asm.go b/src/crypto/aes/_asm/gcm/gcm_amd64_asm.go
new file mode 100644
index 0000000000..c6606822eb
--- /dev/null
+++ b/src/crypto/aes/_asm/gcm/gcm_amd64_asm.go
@@ -0,0 +1,1568 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI +// The implementation uses some optimization as described in: +// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication +// Instruction and its Usage for Computing the GCM Mode rev. 2.02 +// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and +// Hardware + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" +) + +//go:generate go run . -out ../../gcm_amd64.s -pkg aes + +var ( + B0 VecPhysical = X0 + B1 = X1 + B2 = X2 + B3 = X3 + B4 = X4 + B5 = X5 + B6 = X6 + B7 = X7 + + ACC0 VecPhysical = X8 + ACC1 = X9 + ACCM = X10 + + T0 VecPhysical = X11 + T1 = X12 + T2 = X13 + POLY = X14 + BSWAP = X15 +) + +func main() { + Package("crypto/aes") + ConstraintExpr("!purego") + + gcmAesFinish() + gcmAesInit() + gcmAesData() + gcmAesEnc() + gcmAesDec() + + Generate() +} + +func gcmAesFinish() { + Implement("gcmAesFinish") + Attributes(NOSPLIT) + AllocLocal(0) + + var ( + pTbl GPPhysical = RDI + tMsk = RSI + tPtr = RDX + plen = RAX + dlen = RCX + ) + + Load(Param("productTable"), pTbl) + Load(Param("tagMask"), tMsk) + Load(Param("T"), tPtr) + Load(Param("pLen"), plen) + Load(Param("dLen"), dlen) + + MOVOU(Mem{Base: tPtr}, ACC0) + MOVOU(Mem{Base: tMsk}, T2) + + bswapMask := bswapMask_DATA() + gcmPoly := gcmPoly_DATA() + MOVOU(bswapMask, BSWAP) + MOVOU(gcmPoly, POLY) + + SHLQ(Imm(3), plen) + SHLQ(Imm(3), dlen) + + MOVQ(plen, B0) + PINSRQ(Imm(1), dlen, B0) + + PXOR(ACC0, B0) + + MOVOU(Mem{Base: pTbl}.Offset(16*14), ACC0) + MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM) + MOVOU(ACC0, ACC1) + + PCLMULQDQ(Imm(0x00), B0, ACC0) + PCLMULQDQ(Imm(0x11), B0, ACC1) + PSHUFD(Imm(78), B0, T0) + PXOR(B0, T0) + PCLMULQDQ(Imm(0x00), T0, ACCM) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + + MOVOU(POLY, T0) + PCLMULQDQ(Imm(0x01), ACC0, T0) + PSHUFD(Imm(78), ACC0, ACC0) + PXOR(T0, ACC0) + + MOVOU(POLY, T0) + PCLMULQDQ(Imm(0x01), ACC0, T0) + PSHUFD(Imm(78), ACC0, ACC0) + PXOR(T0, ACC0) + + PXOR(ACC1, ACC0) + + PSHUFB(BSWAP, ACC0) + PXOR(T2, ACC0) + MOVOU(ACC0, Mem{Base: tPtr}) + + RET() +} + +func gcmAesInit() { + Implement("gcmAesInit") + Attributes(NOSPLIT) + AllocLocal(0) + + var ( + dst GPPhysical = RDI + KS = RSI + NR = RDX + ) + + Load(Param("productTable"), dst) + Load(Param("ks").Base(), KS) + Load(Param("ks").Len(), NR) + + SHRQ(Imm(2), NR) + DECQ(NR) + + bswapMask := bswapMask_DATA() + gcmPoly := gcmPoly_DATA() + MOVOU(bswapMask, BSWAP) + MOVOU(gcmPoly, POLY) + + Comment("Encrypt block 0, with the AES key to generate the hash key H") + MOVOU(Mem{Base: KS}.Offset(16*0), B0) + MOVOU(Mem{Base: KS}.Offset(16*1), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*2), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*3), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*4), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*5), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*6), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*7), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*8), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*9), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*10), T0) + CMPQ(NR, Imm(12)) + JB(LabelRef("initEncLast")) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*11), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*12), T0) + 
JE(LabelRef("initEncLast")) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*13), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: KS}.Offset(16*14), T0) + + initEncLast(dst) + initLoop(dst) + + RET() +} + +func initEncLast(dst GPPhysical) { + Label("initEncLast") + AESENCLAST(T0, B0) + + PSHUFB(BSWAP, B0) + Comment("H * 2") + PSHUFD(Imm(0xff), B0, T0) + MOVOU(B0, T1) + PSRAL(Imm(31), T0) + PAND(POLY, T0) + PSRLL(Imm(31), T1) + PSLLDQ(Imm(4), T1) + PSLLL(Imm(1), B0) + PXOR(T0, B0) + PXOR(T1, B0) + Comment("Karatsuba pre-computations") + MOVOU(B0, Mem{Base: dst}.Offset(16*14)) + PSHUFD(Imm(78), B0, B1) + PXOR(B0, B1) + MOVOU(B1, Mem{Base: dst}.Offset(16*15)) + + MOVOU(B0, B2) + MOVOU(B1, B3) + Comment("Now prepare powers of H and pre-computations for them") + MOVQ(U32(7), RAX) +} + +func initLoop(dst GPPhysical) { + Label("initLoop") + MOVOU(B2, T0) + MOVOU(B2, T1) + MOVOU(B3, T2) + PCLMULQDQ(Imm(0x00), B0, T0) + PCLMULQDQ(Imm(0x11), B0, T1) + PCLMULQDQ(Imm(0x00), B1, T2) + + PXOR(T0, T2) + PXOR(T1, T2) + MOVOU(T2, B4) + PSLLDQ(Imm(8), B4) + PSRLDQ(Imm(8), T2) + PXOR(B4, T0) + PXOR(T2, T1) + + MOVOU(POLY, B2) + PCLMULQDQ(Imm(0x01), T0, B2) + PSHUFD(Imm(78), T0, T0) + PXOR(B2, T0) + MOVOU(POLY, B2) + PCLMULQDQ(Imm(0x01), T0, B2) + PSHUFD(Imm(78), T0, T0) + PXOR(T0, B2) + PXOR(T1, B2) + + MOVOU(B2, Mem{Base: dst}.Offset(16*12)) + PSHUFD(Imm(78), B2, B3) + PXOR(B2, B3) + MOVOU(B3, Mem{Base: dst}.Offset(16*13)) + + DECQ(RAX) + LEAQ(Mem{Base: dst}.Offset(-16*2), dst) + JNE(LabelRef("initLoop")) +} + +func gcmAesData() { + Implement("gcmAesData") + Attributes(NOSPLIT) + AllocLocal(0) + + var ( + pTbl GPPhysical = RDI + aut = RSI + tPtr = RCX + autLen = RDX + ) + + Load(Param("productTable"), pTbl) + Load(Param("data").Base(), aut) + Load(Param("data").Len(), autLen) + Load(Param("T"), tPtr) + + bswapMask := bswapMask_DATA() + gcmPoly := gcmPoly_DATA() + PXOR(ACC0, ACC0) + MOVOU(bswapMask, BSWAP) + MOVOU(gcmPoly, POLY) + + TESTQ(autLen, autLen) + JEQ(LabelRef("dataBail")) + + CMPQ(autLen, Imm(13)) // optimize the TLS case + JE(LabelRef("dataTLS")) + CMPQ(autLen, Imm(128)) + JB(LabelRef("startSinglesLoop")) + JMP(LabelRef("dataOctaLoop")) + + dataTLS(pTbl, aut, autLen) + dataOctaLoop(pTbl, aut, autLen) + startSinglesLoop(pTbl) + dataSinglesLoop(aut, autLen) + dataMul(aut) + dataEnd(aut, autLen) + dataLoadLoop(aut, autLen) + dataBail(tPtr) +} + +func reduceRound(a VecPhysical) { + MOVOU(POLY, T0) + PCLMULQDQ(Imm(0x01), a, T0) + PSHUFD(Imm(78), a, a) + PXOR(T0, a) +} + +func mulRoundAAD(X VecPhysical, i int, pTbl GPPhysical) { + MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1) + MOVOU(T1, T2) + PCLMULQDQ(Imm(0x00), X, T1) + PXOR(T1, ACC0) + PCLMULQDQ(Imm(0x11), X, T2) + PXOR(T2, ACC1) + PSHUFD(Imm(78), X, T1) + PXOR(T1, X) + MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T1) + PCLMULQDQ(Imm(0x00), X, T1) + PXOR(T1, ACCM) +} + +func dataTLS(pTbl, aut, autLen GPPhysical) { + Label("dataTLS") + MOVOU(Mem{Base: pTbl}.Offset(16*14), T1) + MOVOU(Mem{Base: pTbl}.Offset(16*15), T2) + PXOR(B0, B0) + MOVQ(Mem{Base: aut}, B0) + PINSRD(Imm(2), Mem{Base: aut}.Offset(8), B0) + PINSRB(Imm(12), Mem{Base: aut}.Offset(12), B0) + XORQ(autLen, autLen) + JMP(LabelRef("dataMul")) +} + +func dataOctaLoop(pTbl, aut, autLen GPPhysical) { + Label("dataOctaLoop") + CMPQ(autLen, Imm(128)) + JB(LabelRef("startSinglesLoop")) + SUBQ(Imm(128), autLen) + + MOVOU(Mem{Base: aut}.Offset(16*0), X0) + MOVOU(Mem{Base: aut}.Offset(16*1), X1) + MOVOU(Mem{Base: aut}.Offset(16*2), X2) + MOVOU(Mem{Base: aut}.Offset(16*3), X3) + MOVOU(Mem{Base: aut}.Offset(16*4), 
X4) + MOVOU(Mem{Base: aut}.Offset(16*5), X5) + MOVOU(Mem{Base: aut}.Offset(16*6), X6) + MOVOU(Mem{Base: aut}.Offset(16*7), X7) + LEAQ(Mem{Base: aut}.Offset(16*8), aut) + PSHUFB(BSWAP, X0) + PSHUFB(BSWAP, X1) + PSHUFB(BSWAP, X2) + PSHUFB(BSWAP, X3) + PSHUFB(BSWAP, X4) + PSHUFB(BSWAP, X5) + PSHUFB(BSWAP, X6) + PSHUFB(BSWAP, X7) + PXOR(ACC0, X0) + + MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0) + MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM) + MOVOU(ACC0, ACC1) + PSHUFD(Imm(78), X0, T1) + PXOR(X0, T1) + PCLMULQDQ(Imm(0x00), X0, ACC0) + PCLMULQDQ(Imm(0x11), X0, ACC1) + PCLMULQDQ(Imm(0x00), T1, ACCM) + + mulRoundAAD(X1, 1, pTbl) + mulRoundAAD(X2, 2, pTbl) + mulRoundAAD(X3, 3, pTbl) + mulRoundAAD(X4, 4, pTbl) + mulRoundAAD(X5, 5, pTbl) + mulRoundAAD(X6, 6, pTbl) + mulRoundAAD(X7, 7, pTbl) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + reduceRound(ACC0) + reduceRound(ACC0) + PXOR(ACC1, ACC0) + JMP(LabelRef("dataOctaLoop")) +} + +func startSinglesLoop(pTbl GPPhysical) { + Label("startSinglesLoop") + MOVOU(Mem{Base: pTbl}.Offset(16*14), T1) + MOVOU(Mem{Base: pTbl}.Offset(16*15), T2) + +} + +func dataSinglesLoop(aut, autLen GPPhysical) { + Label("dataSinglesLoop") + + CMPQ(autLen, Imm(16)) + JB(LabelRef("dataEnd")) + SUBQ(Imm(16), autLen) + + MOVOU(Mem{Base: aut}, B0) +} + +func dataMul(aut GPPhysical) { + Label("dataMul") + PSHUFB(BSWAP, B0) + PXOR(ACC0, B0) + + MOVOU(T1, ACC0) + MOVOU(T2, ACCM) + MOVOU(T1, ACC1) + + PSHUFD(Imm(78), B0, T0) + PXOR(B0, T0) + PCLMULQDQ(Imm(0x00), B0, ACC0) + PCLMULQDQ(Imm(0x11), B0, ACC1) + PCLMULQDQ(Imm(0x00), T0, ACCM) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + + MOVOU(POLY, T0) + PCLMULQDQ(Imm(0x01), ACC0, T0) + PSHUFD(Imm(78), ACC0, ACC0) + PXOR(T0, ACC0) + + MOVOU(POLY, T0) + PCLMULQDQ(Imm(0x01), ACC0, T0) + PSHUFD(Imm(78), ACC0, ACC0) + PXOR(T0, ACC0) + PXOR(ACC1, ACC0) + + LEAQ(Mem{Base: aut}.Offset(16), aut) + + JMP(LabelRef("dataSinglesLoop")) +} + +func dataEnd(aut, autLen GPPhysical) { + Label("dataEnd") + + TESTQ(autLen, autLen) + JEQ(LabelRef("dataBail")) + + PXOR(B0, B0) + // LEAQ -1(aut)(autLen*1), aut + LEAQ(Mem{Base: aut, Index: autLen, Scale: 1}.Offset(-1), aut) +} + +func dataLoadLoop(aut, autLen GPPhysical) { + Label("dataLoadLoop") + + PSLLDQ(Imm(1), B0) + PINSRB(Imm(0), Mem{Base: aut}, B0) + + LEAQ(Mem{Base: aut}.Offset(-1), aut) + DECQ(autLen) + JNE(LabelRef("dataLoadLoop")) + + JMP(LabelRef("dataMul")) +} + +func dataBail(tPtr GPPhysical) { + Label("dataBail") + MOVOU(ACC0, Mem{Base: tPtr}) + RET() +} + +func gcmAesEnc() { + Implement("gcmAesEnc") + Attributes(0) + AllocLocal(256) + + var ( + pTbl GPPhysical = RDI + ctx = RDX + ctrPtr = RCX + ptx = RSI + ks = RAX + tPtr = R8 + ptxLen = R9 + aluCTR = R10L + aluTMP = R11L + aluK = R12L + NR = R13 + ) + + Load(Param("productTable"), pTbl) + Load(Param("dst").Base(), ctx) + Load(Param("src").Base(), ptx) + Load(Param("src").Len(), ptxLen) + Load(Param("ctr"), ctrPtr) + Load(Param("T"), tPtr) + Load(Param("ks").Base(), ks) + Load(Param("ks").Len(), NR) + + SHRQ(Imm(2), NR) + DECQ(NR) + + bswapMask := bswapMask_DATA() + gcmPoly := gcmPoly_DATA() + MOVOU(bswapMask, BSWAP) + MOVOU(gcmPoly, POLY) + + MOVOU(Mem{Base: tPtr}, ACC0) + PXOR(ACC1, ACC1) + PXOR(ACCM, ACCM) + MOVOU(Mem{Base: ctrPtr}, B0) + MOVL(Mem{Base: ctrPtr}.Offset(3*4), aluCTR) + MOVOU(Mem{Base: ks}, T0) + MOVL(Mem{Base: ks}.Offset(3*4), aluK) + BSWAPL(aluCTR) 
+ BSWAPL(aluK) + + PXOR(B0, T0) + MOVOU(T0, Mem{Base: SP}.Offset(8*16+0*16)) + incrementEnc(0, aluCTR, aluTMP, aluK) + + CMPQ(ptxLen, Imm(128)) + JB(LabelRef("gcmAesEncSingles")) + SUBQ(Imm(128), ptxLen) + + Comment("We have at least 8 blocks to encrypt, prepare the rest of the counters") + MOVOU(T0, Mem{Base: SP}.Offset(8*16+1*16)) + incrementEnc(1, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(8*16+2*16)) + incrementEnc(2, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(8*16+3*16)) + incrementEnc(3, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(8*16+4*16)) + incrementEnc(4, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(8*16+5*16)) + incrementEnc(5, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(8*16+6*16)) + incrementEnc(6, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(8*16+7*16)) + incrementEnc(7, aluCTR, aluTMP, aluK) + + MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0) + MOVOU(Mem{Base: SP}.Offset(8*16+1*16), B1) + MOVOU(Mem{Base: SP}.Offset(8*16+2*16), B2) + MOVOU(Mem{Base: SP}.Offset(8*16+3*16), B3) + MOVOU(Mem{Base: SP}.Offset(8*16+4*16), B4) + MOVOU(Mem{Base: SP}.Offset(8*16+5*16), B5) + MOVOU(Mem{Base: SP}.Offset(8*16+6*16), B6) + MOVOU(Mem{Base: SP}.Offset(8*16+7*16), B7) + + aesRound(1, ks) + incrementEnc(0, aluCTR, aluTMP, aluK) + aesRound(2, ks) + incrementEnc(1, aluCTR, aluTMP, aluK) + aesRound(3, ks) + incrementEnc(2, aluCTR, aluTMP, aluK) + aesRound(4, ks) + incrementEnc(3, aluCTR, aluTMP, aluK) + aesRound(5, ks) + incrementEnc(4, aluCTR, aluTMP, aluK) + aesRound(6, ks) + incrementEnc(5, aluCTR, aluTMP, aluK) + aesRound(7, ks) + incrementEnc(6, aluCTR, aluTMP, aluK) + aesRound(8, ks) + incrementEnc(7, aluCTR, aluTMP, aluK) + aesRound(9, ks) + MOVOU(Mem{Base: ks}.Offset(16*10), T0) + CMPQ(NR, Imm(12)) + JB(LabelRef("encLast1")) + aesRnd(T0) + aesRound(11, ks) + MOVOU(Mem{Base: ks}.Offset(16*12), T0) + JE(LabelRef("encLast1")) + aesRnd(T0) + aesRound(13, ks) + MOVOU(Mem{Base: ks}.Offset(16*14), T0) + + encLast1(ctx, ptx) + gcmAesEncOctetsLoop(pTbl, ks, ptxLen, aluCTR, aluTMP, aluK, NR) + encLast2(ctx, ptx) + gcmAesEncOctetsEnd(pTbl, ptxLen, aluCTR) + gcmAesEncSingles(pTbl, ks) + gcmAesEncSinglesLoop(ks, ptxLen, aluCTR, aluTMP, aluK, NR) + encLast3(pTbl, ctx, ptx) + gcmAesEncTail(ks, ptxLen, NR) + encLast4(ptx, ptxLen, aluCTR, aluTMP) + ptxLoadLoop(pTbl, ctx, ptx, ptxLen) + gcmAesEncDone(tPtr) +} + +func incrementEnc(i int, aluCTR, aluTMP, aluK GPPhysical) { + ADDL(Imm(1), aluCTR) + MOVL(aluCTR, aluTMP) + XORL(aluK, aluTMP) + BSWAPL(aluTMP) + MOVL(aluTMP, Mem{Base: SP}.Offset(3*4+8*16+i*16)) +} + +func aesRnd(k VecPhysical) { + AESENC(k, B0) + AESENC(k, B1) + AESENC(k, B2) + AESENC(k, B3) + AESENC(k, B4) + AESENC(k, B5) + AESENC(k, B6) + AESENC(k, B7) +} + +func aesRound(i int, ks GPPhysical) { + // MOVOU (16*i)(ks), T0 + MOVOU(Mem{Base: ks}.Offset(16*i), T0) + AESENC(T0, B0) + AESENC(T0, B1) + AESENC(T0, B2) + AESENC(T0, B3) + AESENC(T0, B4) + AESENC(T0, B5) + AESENC(T0, B6) + AESENC(T0, B7) +} + +func aesRndLast(k VecPhysical) { + AESENCLAST(k, B0) + AESENCLAST(k, B1) + AESENCLAST(k, B2) + AESENCLAST(k, B3) + AESENCLAST(k, B4) + AESENCLAST(k, B5) + AESENCLAST(k, B6) + AESENCLAST(k, B7) +} + +func combinedRound(i int, pTbl, ks GPPhysical) { + MOVOU(Mem{Base: ks}.Offset(16*i), T0) + AESENC(T0, B0) + AESENC(T0, B1) + AESENC(T0, B2) + AESENC(T0, B3) + MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1) + MOVOU(T1, T2) + AESENC(T0, B4) + AESENC(T0, B5) + AESENC(T0, B6) + AESENC(T0, B7) + MOVOU(Mem{Base: SP}.Offset(16*i), T0) + 
PCLMULQDQ(Imm(0x00), T0, T1) + PXOR(T1, ACC0) + PSHUFD(Imm(78), T0, T1) + PCLMULQDQ(Imm(0x11), T0, T2) + PXOR(T1, T0) + PXOR(T2, ACC1) + MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T2) + PCLMULQDQ(Imm(0x00), T2, T0) + PXOR(T0, ACCM) +} + +func mulRound(i int, pTbl GPPhysical) { + MOVOU(Mem{Base: SP}.Offset(16*i), T0) + MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1) + MOVOU(T1, T2) + PCLMULQDQ(Imm(0x00), T0, T1) + PXOR(T1, ACC0) + PCLMULQDQ(Imm(0x11), T0, T2) + PXOR(T2, ACC1) + PSHUFD(Imm(78), T0, T1) + PXOR(T1, T0) + MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T1) + PCLMULQDQ(Imm(0x00), T0, T1) + PXOR(T1, ACCM) +} + +func encLast1(ctx, ptx GPPhysical) { + Label("encLast1") + aesRndLast(T0) + + MOVOU(Mem{Base: ptx}.Offset(16*0), T0) + PXOR(T0, B0) + MOVOU(Mem{Base: ptx}.Offset(16*1), T0) + PXOR(T0, B1) + MOVOU(Mem{Base: ptx}.Offset(16*2), T0) + PXOR(T0, B2) + MOVOU(Mem{Base: ptx}.Offset(16*3), T0) + PXOR(T0, B3) + MOVOU(Mem{Base: ptx}.Offset(16*4), T0) + PXOR(T0, B4) + MOVOU(Mem{Base: ptx}.Offset(16*5), T0) + PXOR(T0, B5) + MOVOU(Mem{Base: ptx}.Offset(16*6), T0) + PXOR(T0, B6) + MOVOU(Mem{Base: ptx}.Offset(16*7), T0) + PXOR(T0, B7) + + MOVOU(B0, Mem{Base: ctx}.Offset(16*0)) + PSHUFB(BSWAP, B0) + PXOR(ACC0, B0) + MOVOU(B1, Mem{Base: ctx}.Offset(16*1)) + PSHUFB(BSWAP, B1) + MOVOU(B2, Mem{Base: ctx}.Offset(16*2)) + PSHUFB(BSWAP, B2) + MOVOU(B3, Mem{Base: ctx}.Offset(16*3)) + PSHUFB(BSWAP, B3) + MOVOU(B4, Mem{Base: ctx}.Offset(16*4)) + PSHUFB(BSWAP, B4) + MOVOU(B5, Mem{Base: ctx}.Offset(16*5)) + PSHUFB(BSWAP, B5) + MOVOU(B6, Mem{Base: ctx}.Offset(16*6)) + PSHUFB(BSWAP, B6) + MOVOU(B7, Mem{Base: ctx}.Offset(16*7)) + PSHUFB(BSWAP, B7) + + MOVOU(B0, Mem{Base: SP}.Offset(16*0)) + MOVOU(B1, Mem{Base: SP}.Offset(16*1)) + MOVOU(B2, Mem{Base: SP}.Offset(16*2)) + MOVOU(B3, Mem{Base: SP}.Offset(16*3)) + MOVOU(B4, Mem{Base: SP}.Offset(16*4)) + MOVOU(B5, Mem{Base: SP}.Offset(16*5)) + MOVOU(B6, Mem{Base: SP}.Offset(16*6)) + MOVOU(B7, Mem{Base: SP}.Offset(16*7)) + + LEAQ(Mem{Base: ptx}.Offset(128), ptx) + LEAQ(Mem{Base: ctx}.Offset(128), ctx) +} + +func gcmAesEncOctetsLoop(pTbl, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) { + Label("gcmAesEncOctetsLoop") + + CMPQ(ptxLen, Imm(128)) + JB(LabelRef("gcmAesEncOctetsEnd")) + SUBQ(Imm(128), ptxLen) + + MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0) + MOVOU(Mem{Base: SP}.Offset(8*16+1*16), B1) + MOVOU(Mem{Base: SP}.Offset(8*16+2*16), B2) + MOVOU(Mem{Base: SP}.Offset(8*16+3*16), B3) + MOVOU(Mem{Base: SP}.Offset(8*16+4*16), B4) + MOVOU(Mem{Base: SP}.Offset(8*16+5*16), B5) + MOVOU(Mem{Base: SP}.Offset(8*16+6*16), B6) + MOVOU(Mem{Base: SP}.Offset(8*16+7*16), B7) + + MOVOU(Mem{Base: SP}.Offset(16*0), T0) + PSHUFD(Imm(78), T0, T1) + PXOR(T0, T1) + + MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0) + MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM) + MOVOU(ACC0, ACC1) + + PCLMULQDQ(Imm(0x00), T1, ACCM) + PCLMULQDQ(Imm(0x00), T0, ACC0) + PCLMULQDQ(Imm(0x11), T0, ACC1) + + combinedRound(1, pTbl, ks) + incrementEnc(0, aluCTR, aluTMP, aluK) + combinedRound(2, pTbl, ks) + incrementEnc(1, aluCTR, aluTMP, aluK) + combinedRound(3, pTbl, ks) + incrementEnc(2, aluCTR, aluTMP, aluK) + combinedRound(4, pTbl, ks) + incrementEnc(3, aluCTR, aluTMP, aluK) + combinedRound(5, pTbl, ks) + incrementEnc(4, aluCTR, aluTMP, aluK) + combinedRound(6, pTbl, ks) + incrementEnc(5, aluCTR, aluTMP, aluK) + combinedRound(7, pTbl, ks) + incrementEnc(6, aluCTR, aluTMP, aluK) + + aesRound(8, ks) + incrementEnc(7, aluCTR, aluTMP, aluK) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + 
PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + + reduceRound(ACC0) + aesRound(9, ks) + + reduceRound(ACC0) + PXOR(ACC1, ACC0) + + MOVOU(Mem{Base: ks}.Offset(16*10), T0) + CMPQ(NR, Imm(12)) + JB(LabelRef("encLast2")) + aesRnd(T0) + aesRound(11, ks) + MOVOU(Mem{Base: ks}.Offset(16*12), T0) + JE(LabelRef("encLast2")) + aesRnd(T0) + aesRound(13, ks) + MOVOU(Mem{Base: ks}.Offset(16*14), T0) +} + +func encLast2(ctx, ptx GPPhysical) { + Label("encLast2") + aesRndLast(T0) + + MOVOU(Mem{Base: ptx}.Offset(16*0), T0) + PXOR(T0, B0) + MOVOU(Mem{Base: ptx}.Offset(16*1), T0) + PXOR(T0, B1) + MOVOU(Mem{Base: ptx}.Offset(16*2), T0) + PXOR(T0, B2) + MOVOU(Mem{Base: ptx}.Offset(16*3), T0) + PXOR(T0, B3) + MOVOU(Mem{Base: ptx}.Offset(16*4), T0) + PXOR(T0, B4) + MOVOU(Mem{Base: ptx}.Offset(16*5), T0) + PXOR(T0, B5) + MOVOU(Mem{Base: ptx}.Offset(16*6), T0) + PXOR(T0, B6) + MOVOU(Mem{Base: ptx}.Offset(16*7), T0) + PXOR(T0, B7) + + MOVOU(B0, Mem{Base: ctx}.Offset(16*0)) + PSHUFB(BSWAP, B0) + PXOR(ACC0, B0) + MOVOU(B1, Mem{Base: ctx}.Offset(16*1)) + PSHUFB(BSWAP, B1) + MOVOU(B2, Mem{Base: ctx}.Offset(16*2)) + PSHUFB(BSWAP, B2) + MOVOU(B3, Mem{Base: ctx}.Offset(16*3)) + PSHUFB(BSWAP, B3) + MOVOU(B4, Mem{Base: ctx}.Offset(16*4)) + PSHUFB(BSWAP, B4) + MOVOU(B5, Mem{Base: ctx}.Offset(16*5)) + PSHUFB(BSWAP, B5) + MOVOU(B6, Mem{Base: ctx}.Offset(16*6)) + PSHUFB(BSWAP, B6) + MOVOU(B7, Mem{Base: ctx}.Offset(16*7)) + PSHUFB(BSWAP, B7) + + MOVOU(B0, Mem{Base: SP}.Offset(16*0)) + MOVOU(B1, Mem{Base: SP}.Offset(16*1)) + MOVOU(B2, Mem{Base: SP}.Offset(16*2)) + MOVOU(B3, Mem{Base: SP}.Offset(16*3)) + MOVOU(B4, Mem{Base: SP}.Offset(16*4)) + MOVOU(B5, Mem{Base: SP}.Offset(16*5)) + MOVOU(B6, Mem{Base: SP}.Offset(16*6)) + MOVOU(B7, Mem{Base: SP}.Offset(16*7)) + + LEAQ(Mem{Base: ptx}.Offset(128), ptx) + LEAQ(Mem{Base: ctx}.Offset(128), ctx) + + JMP(LabelRef("gcmAesEncOctetsLoop")) +} + +func gcmAesEncOctetsEnd(pTbl, ptxLen, aluCTR GPPhysical) { + Label("gcmAesEncOctetsEnd") + + MOVOU(Mem{Base: SP}.Offset(16*0), T0) + MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0) + MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM) + MOVOU(ACC0, ACC1) + PSHUFD(Imm(78), T0, T1) + PXOR(T0, T1) + PCLMULQDQ(Imm(0x00), T0, ACC0) + PCLMULQDQ(Imm(0x11), T0, ACC1) + PCLMULQDQ(Imm(0x00), T1, ACCM) + + mulRound(1, pTbl) + mulRound(2, pTbl) + mulRound(3, pTbl) + mulRound(4, pTbl) + mulRound(5, pTbl) + mulRound(6, pTbl) + mulRound(7, pTbl) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + + reduceRound(ACC0) + reduceRound(ACC0) + PXOR(ACC1, ACC0) + + TESTQ(ptxLen, ptxLen) + JE(LabelRef("gcmAesEncDone")) + + // Hack to get Avo to emit: + // SUBQ $7, aluCTR` + Instruction(&ir.Instruction{Opcode: "SUBQ", Operands: []Op{Imm(7), aluCTR}}) +} + +func gcmAesEncSingles(pTbl, ks GPPhysical) { + Label("gcmAesEncSingles") + + MOVOU(Mem{Base: ks}.Offset(16*1), B1) + MOVOU(Mem{Base: ks}.Offset(16*2), B2) + MOVOU(Mem{Base: ks}.Offset(16*3), B3) + MOVOU(Mem{Base: ks}.Offset(16*4), B4) + MOVOU(Mem{Base: ks}.Offset(16*5), B5) + MOVOU(Mem{Base: ks}.Offset(16*6), B6) + MOVOU(Mem{Base: ks}.Offset(16*7), B7) + + MOVOU(Mem{Base: pTbl}.Offset(16*14), T2) +} + +func gcmAesEncSinglesLoop(ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) { + Label("gcmAesEncSinglesLoop") + + CMPQ(ptxLen, Imm(16)) + JB(LabelRef("gcmAesEncTail")) + SUBQ(Imm(16), ptxLen) + + MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0) + incrementEnc(0, aluCTR, aluTMP, aluK) + + AESENC(B1, B0) + AESENC(B2, B0) + AESENC(B3, B0) + AESENC(B4, 
B0) + AESENC(B5, B0) + AESENC(B6, B0) + AESENC(B7, B0) + MOVOU(Mem{Base: ks}.Offset(16*8), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*9), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*10), T0) + CMPQ(NR, Imm(12)) + JB(LabelRef("encLast3")) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*11), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*12), T0) + JE(LabelRef("encLast3")) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*13), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*14), T0) +} + +func encLast3(pTbl, ctx, ptx GPPhysical) { + Label("encLast3") + AESENCLAST(T0, B0) + + MOVOU(Mem{Base: ptx}, T0) + PXOR(T0, B0) + MOVOU(B0, Mem{Base: ctx}) + + PSHUFB(BSWAP, B0) + PXOR(ACC0, B0) + + MOVOU(T2, ACC0) + MOVOU(T2, ACC1) + MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM) + + PSHUFD(Imm(78), B0, T0) + PXOR(B0, T0) + PCLMULQDQ(Imm(0x00), B0, ACC0) + PCLMULQDQ(Imm(0x11), B0, ACC1) + PCLMULQDQ(Imm(0x00), T0, ACCM) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + + reduceRound(ACC0) + reduceRound(ACC0) + PXOR(ACC1, ACC0) + + LEAQ(Mem{Base: ptx}.Offset(16*1), ptx) + LEAQ(Mem{Base: ctx}.Offset(16*1), ctx) + + JMP(LabelRef("gcmAesEncSinglesLoop")) +} + +func gcmAesEncTail(ks, ptxLen, NR GPPhysical) { + Label("gcmAesEncTail") + TESTQ(ptxLen, ptxLen) + JE(LabelRef("gcmAesEncDone")) + + MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0) + AESENC(B1, B0) + AESENC(B2, B0) + AESENC(B3, B0) + AESENC(B4, B0) + AESENC(B5, B0) + AESENC(B6, B0) + AESENC(B7, B0) + MOVOU(Mem{Base: ks}.Offset(16*8), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*9), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*10), T0) + CMPQ(NR, Imm(12)) + JB(LabelRef("encLast4")) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*11), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*12), T0) + JE(LabelRef("encLast4")) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*13), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*14), T0) +} + +func encLast4(ptx, ptxLen, aluCTR, aluTMP GPPhysical) { + Label("encLast4") + AESENCLAST(T0, B0) + MOVOU(B0, T0) + + LEAQ(Mem{Base: ptx, Index: ptxLen, Scale: 1}.Offset(-1), ptx) + + // Hack to get Avo to emit: + // MOVQ ptxLen, aluTMP + Instruction(&ir.Instruction{Opcode: "MOVQ", Operands: []Op{ptxLen, aluTMP}}) + // Hack to get Avo to emit: + // SHLQ $4, aluTMP + Instruction(&ir.Instruction{Opcode: "SHLQ", Operands: []Op{Imm(4), aluTMP}}) + + andMask := andMask_DATA() + // Hack to get Avo to emit: + // LEAQ andMask<>(SB), aluCTR + Instruction(&ir.Instruction{Opcode: "LEAQ", Operands: []Op{andMask, aluCTR}}) + MOVOU(Mem{Base: aluCTR, Index: aluTMP, Scale: 1}.Offset(-16), T1) + + PXOR(B0, B0) +} + +func ptxLoadLoop(pTbl, ctx, ptx, ptxLen GPPhysical) { + Label("ptxLoadLoop") + PSLLDQ(Imm(1), B0) + PINSRB(Imm(0), Mem{Base: ptx}, B0) + LEAQ(Mem{Base: ptx}.Offset(-1), ptx) + DECQ(ptxLen) + JNE(LabelRef("ptxLoadLoop")) + + PXOR(T0, B0) + PAND(T1, B0) + MOVOU(B0, Mem{Base: ctx}) + + PSHUFB(BSWAP, B0) + PXOR(ACC0, B0) + + MOVOU(T2, ACC0) + MOVOU(T2, ACC1) + MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM) + + PSHUFD(Imm(78), B0, T0) + PXOR(B0, T0) + PCLMULQDQ(Imm(0x00), B0, ACC0) + PCLMULQDQ(Imm(0x11), B0, ACC1) + PCLMULQDQ(Imm(0x00), T0, ACCM) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + + reduceRound(ACC0) + reduceRound(ACC0) + PXOR(ACC1, ACC0) +} + +func gcmAesEncDone(tPtr GPPhysical) { + 
Label("gcmAesEncDone") + MOVOU(ACC0, Mem{Base: tPtr}) + RET() +} + +func gcmAesDec() { + Implement("gcmAesDec") + Attributes(0) + AllocLocal(128) + + var ( + pTbl GPPhysical = RDI + ctx = RDX + ctrPtr = RCX + ptx = RSI + ks = RAX + tPtr = R8 + ptxLen = R9 + aluCTR = R10L + aluTMP = R11L + aluK = R12L + NR = R13 + ) + + Load(Param("productTable"), pTbl) + Load(Param("dst").Base(), ptx) + Load(Param("src").Base(), ctx) + Load(Param("src").Len(), ptxLen) + Load(Param("ctr"), ctrPtr) + Load(Param("T"), tPtr) + Load(Param("ks").Base(), ks) + Load(Param("ks").Len(), NR) + + SHRQ(Imm(2), NR) + DECQ(NR) + + bswapMask := bswapMask_DATA() + gcmPoly := gcmPoly_DATA() + MOVOU(bswapMask, BSWAP) + MOVOU(gcmPoly, POLY) + + MOVOU(Mem{Base: tPtr}, ACC0) + PXOR(ACC1, ACC1) + PXOR(ACCM, ACCM) + MOVOU(Mem{Base: ctrPtr}, B0) + MOVL(Mem{Base: ctrPtr}.Offset(3*4), aluCTR) + MOVOU(Mem{Base: ks}, T0) + MOVL(Mem{Base: ks}.Offset(3*4), aluK) + BSWAPL(aluCTR) + BSWAPL(aluK) + + PXOR(B0, T0) + MOVOU(T0, Mem{Base: SP}.Offset(0*16)) + incrementDec(0, aluCTR, aluTMP, aluK) + + CMPQ(ptxLen, Imm(128)) + JB(LabelRef("gcmAesDecSingles")) + + MOVOU(T0, Mem{Base: SP}.Offset(1*16)) + incrementDec(1, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(2*16)) + incrementDec(2, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(3*16)) + incrementDec(3, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(4*16)) + incrementDec(4, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(5*16)) + incrementDec(5, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(6*16)) + incrementDec(6, aluCTR, aluTMP, aluK) + MOVOU(T0, Mem{Base: SP}.Offset(7*16)) + incrementDec(7, aluCTR, aluTMP, aluK) + + gcmAesDecOctetsLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR) + decLast1(ctx, ptx) + gcmAesDecEndOctets(aluCTR) + gcmAesDecSingles(pTbl, ks) + gcmAesDecSinglesLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR) + decLast2(ctx, ptx) + gcmAesDecTail(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR) + decLast3() + ptxStoreLoop(ptx, ptxLen) + gcmAesDecDone(tPtr) +} + +func incrementDec(i int, aluCTR, aluTMP, aluK GPPhysical) { + ADDL(Imm(1), aluCTR) + MOVL(aluCTR, aluTMP) + XORL(aluK, aluTMP) + BSWAPL(aluTMP) + MOVL(aluTMP, Mem{Base: SP}.Offset(3*4+i*16)) +} + +func combinedDecRound(i int, pTbl, ctx, ks GPPhysical) { + MOVOU(Mem{Base: ks}.Offset(16*i), T0) + AESENC(T0, B0) + AESENC(T0, B1) + AESENC(T0, B2) + AESENC(T0, B3) + MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1) + MOVOU(T1, T2) + AESENC(T0, B4) + AESENC(T0, B5) + AESENC(T0, B6) + AESENC(T0, B7) + MOVOU(Mem{Base: ctx}.Offset(16*i), T0) + PSHUFB(BSWAP, T0) + PCLMULQDQ(Imm(0x00), T0, T1) + PXOR(T1, ACC0) + PSHUFD(Imm(78), T0, T1) + PCLMULQDQ(Imm(0x11), T0, T2) + PXOR(T1, T0) + PXOR(T2, ACC1) + MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T2) + PCLMULQDQ(Imm(0x00), T2, T0) + PXOR(T0, ACCM) +} + +func gcmAesDecOctetsLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) { + Label("gcmAesDecOctetsLoop") + + CMPQ(ptxLen, Imm(128)) + JB(LabelRef("gcmAesDecEndOctets")) + SUBQ(Imm(128), ptxLen) + + MOVOU(Mem{Base: SP}.Offset(0*16), B0) + MOVOU(Mem{Base: SP}.Offset(1*16), B1) + MOVOU(Mem{Base: SP}.Offset(2*16), B2) + MOVOU(Mem{Base: SP}.Offset(3*16), B3) + MOVOU(Mem{Base: SP}.Offset(4*16), B4) + MOVOU(Mem{Base: SP}.Offset(5*16), B5) + MOVOU(Mem{Base: SP}.Offset(6*16), B6) + MOVOU(Mem{Base: SP}.Offset(7*16), B7) + + MOVOU(Mem{Base: ctx}.Offset(16*0), T0) + PSHUFB(BSWAP, T0) + PXOR(ACC0, T0) + PSHUFD(Imm(78), T0, T1) + PXOR(T0, T1) + + MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0) + 
MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM) + MOVOU(ACC0, ACC1) + + PCLMULQDQ(Imm(0x00), T1, ACCM) + PCLMULQDQ(Imm(0x00), T0, ACC0) + PCLMULQDQ(Imm(0x11), T0, ACC1) + + combinedDecRound(1, pTbl, ctx, ks) + incrementDec(0, aluCTR, aluTMP, aluK) + combinedDecRound(2, pTbl, ctx, ks) + incrementDec(1, aluCTR, aluTMP, aluK) + combinedDecRound(3, pTbl, ctx, ks) + incrementDec(2, aluCTR, aluTMP, aluK) + combinedDecRound(4, pTbl, ctx, ks) + incrementDec(3, aluCTR, aluTMP, aluK) + combinedDecRound(5, pTbl, ctx, ks) + incrementDec(4, aluCTR, aluTMP, aluK) + combinedDecRound(6, pTbl, ctx, ks) + incrementDec(5, aluCTR, aluTMP, aluK) + combinedDecRound(7, pTbl, ctx, ks) + incrementDec(6, aluCTR, aluTMP, aluK) + + aesRound(8, ks) + incrementDec(7, aluCTR, aluTMP, aluK) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + + reduceRound(ACC0) + aesRound(9, ks) + + reduceRound(ACC0) + PXOR(ACC1, ACC0) + + MOVOU(Mem{Base: ks}.Offset(16*10), T0) + CMPQ(NR, Imm(12)) + JB(LabelRef("decLast1")) + aesRnd(T0) + aesRound(11, ks) + MOVOU(Mem{Base: ks}.Offset(16*12), T0) + JE(LabelRef("decLast1")) + aesRnd(T0) + aesRound(13, ks) + MOVOU(Mem{Base: ks}.Offset(16*14), T0) +} + +func decLast1(ctx, ptx GPPhysical) { + Label("decLast1") + aesRndLast(T0) + + MOVOU(Mem{Base: ctx}.Offset(16*0), T0) + PXOR(T0, B0) + MOVOU(Mem{Base: ctx}.Offset(16*1), T0) + PXOR(T0, B1) + MOVOU(Mem{Base: ctx}.Offset(16*2), T0) + PXOR(T0, B2) + MOVOU(Mem{Base: ctx}.Offset(16*3), T0) + PXOR(T0, B3) + MOVOU(Mem{Base: ctx}.Offset(16*4), T0) + PXOR(T0, B4) + MOVOU(Mem{Base: ctx}.Offset(16*5), T0) + PXOR(T0, B5) + MOVOU(Mem{Base: ctx}.Offset(16*6), T0) + PXOR(T0, B6) + MOVOU(Mem{Base: ctx}.Offset(16*7), T0) + PXOR(T0, B7) + + MOVOU(B0, Mem{Base: ptx}.Offset(16*0)) + MOVOU(B1, Mem{Base: ptx}.Offset(16*1)) + MOVOU(B2, Mem{Base: ptx}.Offset(16*2)) + MOVOU(B3, Mem{Base: ptx}.Offset(16*3)) + MOVOU(B4, Mem{Base: ptx}.Offset(16*4)) + MOVOU(B5, Mem{Base: ptx}.Offset(16*5)) + MOVOU(B6, Mem{Base: ptx}.Offset(16*6)) + MOVOU(B7, Mem{Base: ptx}.Offset(16*7)) + + LEAQ(Mem{Base: ptx}.Offset(128), ptx) + LEAQ(Mem{Base: ctx}.Offset(128), ctx) + + JMP(LabelRef("gcmAesDecOctetsLoop")) +} + +func gcmAesDecEndOctets(aluCTR GPPhysical) { + Label("gcmAesDecEndOctets") + // Hack to make Avo emit: + // SUBQ $7, aluCTR + Instruction(&ir.Instruction{Opcode: "SUBQ", Operands: []Op{Imm(7), aluCTR}}) +} + +func gcmAesDecSingles(pTbl, ks GPPhysical) { + Label("gcmAesDecSingles") + + MOVOU(Mem{Base: ks}.Offset(16*1), B1) + MOVOU(Mem{Base: ks}.Offset(16*2), B2) + MOVOU(Mem{Base: ks}.Offset(16*3), B3) + MOVOU(Mem{Base: ks}.Offset(16*4), B4) + MOVOU(Mem{Base: ks}.Offset(16*5), B5) + MOVOU(Mem{Base: ks}.Offset(16*6), B6) + MOVOU(Mem{Base: ks}.Offset(16*7), B7) + + MOVOU(Mem{Base: pTbl}.Offset(16*14), T2) +} + +func gcmAesDecSinglesLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) { + Label("gcmAesDecSinglesLoop") + + CMPQ(ptxLen, Imm(16)) + JB(LabelRef("gcmAesDecTail")) + SUBQ(Imm(16), ptxLen) + + MOVOU(Mem{Base: ctx}, B0) + MOVOU(B0, T1) + PSHUFB(BSWAP, B0) + PXOR(ACC0, B0) + + MOVOU(T2, ACC0) + MOVOU(T2, ACC1) + MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM) + + PCLMULQDQ(Imm(0x00), B0, ACC0) + PCLMULQDQ(Imm(0x11), B0, ACC1) + PSHUFD(Imm(78), B0, T0) + PXOR(B0, T0) + PCLMULQDQ(Imm(0x00), T0, ACCM) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + + reduceRound(ACC0) + reduceRound(ACC0) + PXOR(ACC1, 
ACC0) + + MOVOU(Mem{Base: SP}.Offset(0*16), B0) + incrementDec(0, aluCTR, aluTMP, aluK) + AESENC(B1, B0) + AESENC(B2, B0) + AESENC(B3, B0) + AESENC(B4, B0) + AESENC(B5, B0) + AESENC(B6, B0) + AESENC(B7, B0) + MOVOU(Mem{Base: ks}.Offset(16*8), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*9), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*10), T0) + CMPQ(NR, Imm(12)) + JB(LabelRef("decLast2")) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*11), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*12), T0) + JE(LabelRef("decLast2")) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*13), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*14), T0) +} + +func decLast2(ctx, ptx GPPhysical) { + Label("decLast2") + AESENCLAST(T0, B0) + + PXOR(T1, B0) + MOVOU(B0, Mem{Base: ptx}) + + LEAQ(Mem{Base: ptx}.Offset(16*1), ptx) + LEAQ(Mem{Base: ctx}.Offset(16*1), ctx) + + JMP(LabelRef("gcmAesDecSinglesLoop")) +} + +func gcmAesDecTail(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) { + Label("gcmAesDecTail") + + TESTQ(ptxLen, ptxLen) + JE(LabelRef("gcmAesDecDone")) + + // Hack to get Avo to emit: + // MOVQ ptxLen, aluTMP + Instruction(&ir.Instruction{Opcode: "MOVQ", Operands: []Op{ptxLen, aluTMP}}) + // Hack to get Avo to emit: + // SHLQ $4, aluTMP + Instruction(&ir.Instruction{Opcode: "SHLQ", Operands: []Op{Imm(4), aluTMP}}) + + andMask := andMask_DATA() + // Hack to get Avo to emit: + // LEAQ andMask<>(SB), aluCTR + Instruction(&ir.Instruction{Opcode: "LEAQ", Operands: []Op{andMask, aluCTR}}) + MOVOU(Mem{Base: aluCTR, Index: aluTMP, Scale: 1}.Offset(-16), T1) + + MOVOU(Mem{Base: ctx}, B0) + PAND(T1, B0) + + MOVOU(B0, T1) + PSHUFB(BSWAP, B0) + PXOR(ACC0, B0) + + MOVOU(Mem{Base: pTbl}.Offset(16*14), ACC0) + MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM) + MOVOU(ACC0, ACC1) + + PCLMULQDQ(Imm(0x00), B0, ACC0) + PCLMULQDQ(Imm(0x11), B0, ACC1) + PSHUFD(Imm(78), B0, T0) + PXOR(B0, T0) + PCLMULQDQ(Imm(0x00), T0, ACCM) + + PXOR(ACC0, ACCM) + PXOR(ACC1, ACCM) + MOVOU(ACCM, T0) + PSRLDQ(Imm(8), ACCM) + PSLLDQ(Imm(8), T0) + PXOR(ACCM, ACC1) + PXOR(T0, ACC0) + + reduceRound(ACC0) + reduceRound(ACC0) + PXOR(ACC1, ACC0) + + MOVOU(Mem{Base: SP}.Offset(0*16), B0) + incrementDec(0, aluCTR, aluTMP, aluK) + AESENC(B1, B0) + AESENC(B2, B0) + AESENC(B3, B0) + AESENC(B4, B0) + AESENC(B5, B0) + AESENC(B6, B0) + AESENC(B7, B0) + MOVOU(Mem{Base: ks}.Offset(16*8), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*9), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*10), T0) + CMPQ(NR, Imm(12)) + JB(LabelRef("decLast3")) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*11), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*12), T0) + JE(LabelRef("decLast3")) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*13), T0) + AESENC(T0, B0) + MOVOU(Mem{Base: ks}.Offset(16*14), T0) +} + +func decLast3() { + Label("decLast3") + AESENCLAST(T0, B0) + PXOR(T1, B0) +} + +func ptxStoreLoop(ptx, ptxLen GPPhysical) { + Label("ptxStoreLoop") + PEXTRB(Imm(0), B0, Mem{Base: ptx}) + PSRLDQ(Imm(1), B0) + LEAQ(Mem{Base: ptx}.Offset(1), ptx) + DECQ(ptxLen) + + JNE(LabelRef("ptxStoreLoop")) +} + +func gcmAesDecDone(tPtr GPPhysical) { + Label("gcmAesDecDone") + MOVOU(ACC0, Mem{Base: tPtr}) + RET() +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +var bswapMask_DATA_ptr, gcmPoly_DATA_ptr, andMask_DATA_ptr *Mem + +func bswapMask_DATA() Mem { + if bswapMask_DATA_ptr != nil { + return *bswapMask_DATA_ptr + } + + bswapMask := GLOBL("bswapMask", NOPTR|RODATA) + 
bswapMask_DATA_ptr = &bswapMask + DATA(0x00, U64(0x08090a0b0c0d0e0f)) + DATA(0x08, U64(0x0001020304050607)) + + return bswapMask +} + +func gcmPoly_DATA() Mem { + if gcmPoly_DATA_ptr != nil { + return *gcmPoly_DATA_ptr + } + + gcmPoly := GLOBL("gcmPoly", NOPTR|RODATA) + gcmPoly_DATA_ptr = &gcmPoly + DATA(0x00, U64(0x0000000000000001)) + DATA(0x08, U64(0xc200000000000000)) + + return gcmPoly +} + +var andMask_K = [30]uint64{ + 0x00000000000000ff, + 0x0000000000000000, + 0x000000000000ffff, + 0x0000000000000000, + 0x0000000000ffffff, + 0x0000000000000000, + 0x00000000ffffffff, + 0x0000000000000000, + 0x000000ffffffffff, + 0x0000000000000000, + 0x0000ffffffffffff, + 0x0000000000000000, + 0x00ffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x00000000000000ff, + 0xffffffffffffffff, + 0x000000000000ffff, + 0xffffffffffffffff, + 0x0000000000ffffff, + 0xffffffffffffffff, + 0x00000000ffffffff, + 0xffffffffffffffff, + 0x000000ffffffffff, + 0xffffffffffffffff, + 0x0000ffffffffffff, + 0xffffffffffffffff, + 0x00ffffffffffffff, +} + +func andMask_DATA() Mem { + if andMask_DATA_ptr != nil { + return *andMask_DATA_ptr + } + andMask := GLOBL("andMask", NOPTR|RODATA) + andMask_DATA_ptr = &andMask + + for i, k := range andMask_K { + DATA(i*8, U64(k)) + } + + return andMask +} diff --git a/src/crypto/aes/_asm/gcm/go.mod b/src/crypto/aes/_asm/gcm/go.mod new file mode 100644 index 0000000000..ba78b210fb --- /dev/null +++ b/src/crypto/aes/_asm/gcm/go.mod @@ -0,0 +1,11 @@ +module std/crypto/aes/_asm/gcm + +go 1.24 + +require github.com/mmcloughlin/avo v0.6.0 + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/src/crypto/aes/_asm/gcm/go.sum b/src/crypto/aes/_asm/gcm/go.sum new file mode 100644 index 0000000000..76af484b2e --- /dev/null +++ b/src/crypto/aes/_asm/gcm/go.sum @@ -0,0 +1,8 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/src/crypto/aes/gcm_amd64.s b/src/crypto/aes/gcm_amd64.s index f787e6fd6b..7db6a4baf2 100644 --- a/src/crypto/aes/gcm_amd64.s +++ b/src/crypto/aes/gcm_amd64.s @@ -1,1288 +1,1882 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run gcm_amd64_asm.go -out ../../gcm_amd64.s -pkg aes. DO NOT EDIT. //go:build !purego -// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI -// The implementation uses some optimization as described in: -// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication -// Instruction and its Usage for Computing the GCM Mode rev. 
2.02 -// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and -// Hardware - #include "textflag.h" -#define B0 X0 -#define B1 X1 -#define B2 X2 -#define B3 X3 -#define B4 X4 -#define B5 X5 -#define B6 X6 -#define B7 X7 - -#define ACC0 X8 -#define ACC1 X9 -#define ACCM X10 - -#define T0 X11 -#define T1 X12 -#define T2 X13 -#define POLY X14 -#define BSWAP X15 - -DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f -DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 - -DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001 -DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000 - -DATA andMask<>+0x00(SB)/8, $0x00000000000000ff -DATA andMask<>+0x08(SB)/8, $0x0000000000000000 -DATA andMask<>+0x10(SB)/8, $0x000000000000ffff -DATA andMask<>+0x18(SB)/8, $0x0000000000000000 -DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff -DATA andMask<>+0x28(SB)/8, $0x0000000000000000 -DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff -DATA andMask<>+0x38(SB)/8, $0x0000000000000000 -DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff -DATA andMask<>+0x48(SB)/8, $0x0000000000000000 -DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff -DATA andMask<>+0x58(SB)/8, $0x0000000000000000 -DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff -DATA andMask<>+0x68(SB)/8, $0x0000000000000000 -DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff -DATA andMask<>+0x78(SB)/8, $0x0000000000000000 -DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff -DATA andMask<>+0x88(SB)/8, $0x00000000000000ff -DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff -DATA andMask<>+0x98(SB)/8, $0x000000000000ffff -DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff -DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff -DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff -DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff -DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff -DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff - -GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 -GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 -GLOBL andMask<>(SB), (NOPTR+RODATA), $240 - -// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64) -TEXT ·gcmAesFinish(SB),NOSPLIT,$0 -#define pTbl DI -#define tMsk SI -#define tPtr DX -#define plen AX -#define dlen CX - - MOVQ productTable+0(FP), pTbl - MOVQ tagMask+8(FP), tMsk - MOVQ T+16(FP), tPtr - MOVQ pLen+24(FP), plen - MOVQ dLen+32(FP), dlen - - MOVOU (tPtr), ACC0 - MOVOU (tMsk), T2 - - MOVOU bswapMask<>(SB), BSWAP - MOVOU gcmPoly<>(SB), POLY - - SHLQ $3, plen - SHLQ $3, dlen - - MOVQ plen, B0 - PINSRQ $1, dlen, B0 - - PXOR ACC0, B0 - - MOVOU (16*14)(pTbl), ACC0 - MOVOU (16*15)(pTbl), ACCM - MOVOU ACC0, ACC1 - - PCLMULQDQ $0x00, B0, ACC0 - PCLMULQDQ $0x11, B0, ACC1 - PSHUFD $78, B0, T0 - PXOR B0, T0 - PCLMULQDQ $0x00, T0, ACCM - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - - MOVOU POLY, T0 - PCLMULQDQ $0x01, ACC0, T0 - PSHUFD $78, ACC0, ACC0 - PXOR T0, ACC0 - - MOVOU POLY, T0 - PCLMULQDQ $0x01, ACC0, T0 - PSHUFD $78, ACC0, ACC0 - PXOR T0, ACC0 - - PXOR ACC1, ACC0 - - PSHUFB BSWAP, ACC0 - PXOR T2, ACC0 - MOVOU ACC0, (tPtr) - +// func gcmAesFinish(productTable *[256]byte, tagMask *[16]byte, T *[16]byte, pLen uint64, dLen uint64) +// Requires: PCLMULQDQ, SSE2, SSE4.1, SSSE3 +TEXT ·gcmAesFinish(SB), NOSPLIT, $0-40 + MOVQ productTable+0(FP), DI + MOVQ tagMask+8(FP), SI + MOVQ T+16(FP), DX + MOVQ pLen+24(FP), AX 
+ MOVQ dLen+32(FP), CX + MOVOU (DX), X8 + MOVOU (SI), X13 + MOVOU bswapMask<>+0(SB), X15 + MOVOU gcmPoly<>+0(SB), X14 + SHLQ $0x03, AX + SHLQ $0x03, CX + MOVQ AX, X0 + PINSRQ $0x01, CX, X0 + PXOR X8, X0 + MOVOU 224(DI), X8 + MOVOU 240(DI), X10 + MOVOU X8, X9 + PCLMULQDQ $0x00, X0, X8 + PCLMULQDQ $0x11, X0, X9 + PSHUFD $0x4e, X0, X11 + PXOR X0, X11 + PCLMULQDQ $0x00, X11, X10 + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 + PSHUFB X15, X8 + PXOR X13, X8 + MOVOU X8, (DX) RET -#undef pTbl -#undef tMsk -#undef tPtr -#undef plen -#undef dlen - -// func gcmAesInit(productTable *[256]byte, ks []uint32) -TEXT ·gcmAesInit(SB),NOSPLIT,$0 -#define dst DI -#define KS SI -#define NR DX - MOVQ productTable+0(FP), dst - MOVQ ks_base+8(FP), KS - MOVQ ks_len+16(FP), NR +DATA bswapMask<>+0(SB)/8, $0x08090a0b0c0d0e0f +DATA bswapMask<>+8(SB)/8, $0x0001020304050607 +GLOBL bswapMask<>(SB), RODATA|NOPTR, $16 - SHRQ $2, NR - DECQ NR +DATA gcmPoly<>+0(SB)/8, $0x0000000000000001 +DATA gcmPoly<>+8(SB)/8, $0xc200000000000000 +GLOBL gcmPoly<>(SB), RODATA|NOPTR, $16 - MOVOU bswapMask<>(SB), BSWAP - MOVOU gcmPoly<>(SB), POLY +// func gcmAesInit(productTable *[256]byte, ks []uint32) +// Requires: AES, PCLMULQDQ, SSE2, SSSE3 +TEXT ·gcmAesInit(SB), NOSPLIT, $0-32 + MOVQ productTable+0(FP), DI + MOVQ ks_base+8(FP), SI + MOVQ ks_len+16(FP), DX + SHRQ $0x02, DX + DECQ DX + MOVOU bswapMask<>+0(SB), X15 + MOVOU gcmPoly<>+0(SB), X14 // Encrypt block 0, with the AES key to generate the hash key H - MOVOU (16*0)(KS), B0 - MOVOU (16*1)(KS), T0 - AESENC T0, B0 - MOVOU (16*2)(KS), T0 - AESENC T0, B0 - MOVOU (16*3)(KS), T0 - AESENC T0, B0 - MOVOU (16*4)(KS), T0 - AESENC T0, B0 - MOVOU (16*5)(KS), T0 - AESENC T0, B0 - MOVOU (16*6)(KS), T0 - AESENC T0, B0 - MOVOU (16*7)(KS), T0 - AESENC T0, B0 - MOVOU (16*8)(KS), T0 - AESENC T0, B0 - MOVOU (16*9)(KS), T0 - AESENC T0, B0 - MOVOU (16*10)(KS), T0 - CMPQ NR, $12 - JB initEncLast - AESENC T0, B0 - MOVOU (16*11)(KS), T0 - AESENC T0, B0 - MOVOU (16*12)(KS), T0 - JE initEncLast - AESENC T0, B0 - MOVOU (16*13)(KS), T0 - AESENC T0, B0 - MOVOU (16*14)(KS), T0 + MOVOU (SI), X0 + MOVOU 16(SI), X11 + AESENC X11, X0 + MOVOU 32(SI), X11 + AESENC X11, X0 + MOVOU 48(SI), X11 + AESENC X11, X0 + MOVOU 64(SI), X11 + AESENC X11, X0 + MOVOU 80(SI), X11 + AESENC X11, X0 + MOVOU 96(SI), X11 + AESENC X11, X0 + MOVOU 112(SI), X11 + AESENC X11, X0 + MOVOU 128(SI), X11 + AESENC X11, X0 + MOVOU 144(SI), X11 + AESENC X11, X0 + MOVOU 160(SI), X11 + CMPQ DX, $0x0c + JB initEncLast + AESENC X11, X0 + MOVOU 176(SI), X11 + AESENC X11, X0 + MOVOU 192(SI), X11 + JE initEncLast + AESENC X11, X0 + MOVOU 208(SI), X11 + AESENC X11, X0 + MOVOU 224(SI), X11 + initEncLast: - AESENCLAST T0, B0 + AESENCLAST X11, X0 + PSHUFB X15, X0 - PSHUFB BSWAP, B0 // H * 2 - PSHUFD $0xff, B0, T0 - MOVOU B0, T1 - PSRAL $31, T0 - PAND POLY, T0 - PSRLL $31, T1 - PSLLDQ $4, T1 - PSLLL $1, B0 - PXOR T0, B0 - PXOR T1, B0 + PSHUFD $0xff, X0, X11 + MOVOU X0, X12 + PSRAL $0x1f, X11 + PAND X14, X11 + PSRLL $0x1f, X12 + PSLLDQ $0x04, X12 + PSLLL $0x01, X0 + PXOR X11, X0 + PXOR X12, X0 + // Karatsuba pre-computations - MOVOU B0, (16*14)(dst) - PSHUFD $78, B0, B1 - PXOR B0, B1 - MOVOU B1, (16*15)(dst) + MOVOU X0, 224(DI) + PSHUFD $0x4e, X0, X1 + PXOR X0, X1 + MOVOU X1, 240(DI) + MOVOU X0, X2 + MOVOU X1, X3 - MOVOU 
B0, B2 - MOVOU B1, B3 // Now prepare powers of H and pre-computations for them - MOVQ $7, AX + MOVQ $0x00000007, AX initLoop: - MOVOU B2, T0 - MOVOU B2, T1 - MOVOU B3, T2 - PCLMULQDQ $0x00, B0, T0 - PCLMULQDQ $0x11, B0, T1 - PCLMULQDQ $0x00, B1, T2 - - PXOR T0, T2 - PXOR T1, T2 - MOVOU T2, B4 - PSLLDQ $8, B4 - PSRLDQ $8, T2 - PXOR B4, T0 - PXOR T2, T1 - - MOVOU POLY, B2 - PCLMULQDQ $0x01, T0, B2 - PSHUFD $78, T0, T0 - PXOR B2, T0 - MOVOU POLY, B2 - PCLMULQDQ $0x01, T0, B2 - PSHUFD $78, T0, T0 - PXOR T0, B2 - PXOR T1, B2 - - MOVOU B2, (16*12)(dst) - PSHUFD $78, B2, B3 - PXOR B2, B3 - MOVOU B3, (16*13)(dst) - - DECQ AX - LEAQ (-16*2)(dst), dst - JNE initLoop - + MOVOU X2, X11 + MOVOU X2, X12 + MOVOU X3, X13 + PCLMULQDQ $0x00, X0, X11 + PCLMULQDQ $0x11, X0, X12 + PCLMULQDQ $0x00, X1, X13 + PXOR X11, X13 + PXOR X12, X13 + MOVOU X13, X4 + PSLLDQ $0x08, X4 + PSRLDQ $0x08, X13 + PXOR X4, X11 + PXOR X13, X12 + MOVOU X14, X2 + PCLMULQDQ $0x01, X11, X2 + PSHUFD $0x4e, X11, X11 + PXOR X2, X11 + MOVOU X14, X2 + PCLMULQDQ $0x01, X11, X2 + PSHUFD $0x4e, X11, X11 + PXOR X11, X2 + PXOR X12, X2 + MOVOU X2, 192(DI) + PSHUFD $0x4e, X2, X3 + PXOR X2, X3 + MOVOU X3, 208(DI) + DECQ AX + LEAQ -32(DI), DI + JNE initLoop RET -#undef NR -#undef KS -#undef dst // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte) -TEXT ·gcmAesData(SB),NOSPLIT,$0 -#define pTbl DI -#define aut SI -#define tPtr CX -#define autLen DX - -#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a -#define mulRoundAAD(X ,i) \ - MOVOU (16*(i*2))(pTbl), T1;\ - MOVOU T1, T2;\ - PCLMULQDQ $0x00, X, T1;\ - PXOR T1, ACC0;\ - PCLMULQDQ $0x11, X, T2;\ - PXOR T2, ACC1;\ - PSHUFD $78, X, T1;\ - PXOR T1, X;\ - MOVOU (16*(i*2+1))(pTbl), T1;\ - PCLMULQDQ $0x00, X, T1;\ - PXOR T1, ACCM - - MOVQ productTable+0(FP), pTbl - MOVQ data_base+8(FP), aut - MOVQ data_len+16(FP), autLen - MOVQ T+32(FP), tPtr - - PXOR ACC0, ACC0 - MOVOU bswapMask<>(SB), BSWAP - MOVOU gcmPoly<>(SB), POLY - - TESTQ autLen, autLen - JEQ dataBail - - CMPQ autLen, $13 // optimize the TLS case - JE dataTLS - CMPQ autLen, $128 - JB startSinglesLoop - JMP dataOctaLoop +// Requires: PCLMULQDQ, SSE2, SSE4.1, SSSE3 +TEXT ·gcmAesData(SB), NOSPLIT, $0-40 + MOVQ productTable+0(FP), DI + MOVQ data_base+8(FP), SI + MOVQ data_len+16(FP), DX + MOVQ T+32(FP), CX + PXOR X8, X8 + MOVOU bswapMask<>+0(SB), X15 + MOVOU gcmPoly<>+0(SB), X14 + TESTQ DX, DX + JEQ dataBail + CMPQ DX, $0x0d + JE dataTLS + CMPQ DX, $0x80 + JB startSinglesLoop + JMP dataOctaLoop dataTLS: - MOVOU (16*14)(pTbl), T1 - MOVOU (16*15)(pTbl), T2 - PXOR B0, B0 - MOVQ (aut), B0 - PINSRD $2, 8(aut), B0 - PINSRB $12, 12(aut), B0 - XORQ autLen, autLen - JMP dataMul + MOVOU 224(DI), X12 + MOVOU 240(DI), X13 + PXOR X0, X0 + MOVQ (SI), X0 + PINSRD $0x02, 8(SI), X0 + PINSRB $0x0c, 12(SI), X0 + XORQ DX, DX + JMP dataMul dataOctaLoop: - CMPQ autLen, $128 - JB startSinglesLoop - SUBQ $128, autLen - - MOVOU (16*0)(aut), X0 - MOVOU (16*1)(aut), X1 - MOVOU (16*2)(aut), X2 - MOVOU (16*3)(aut), X3 - MOVOU (16*4)(aut), X4 - MOVOU (16*5)(aut), X5 - MOVOU (16*6)(aut), X6 - MOVOU (16*7)(aut), X7 - LEAQ (16*8)(aut), aut - PSHUFB BSWAP, X0 - PSHUFB BSWAP, X1 - PSHUFB BSWAP, X2 - PSHUFB BSWAP, X3 - PSHUFB BSWAP, X4 - PSHUFB BSWAP, X5 - PSHUFB BSWAP, X6 - PSHUFB BSWAP, X7 - PXOR ACC0, X0 - - MOVOU (16*0)(pTbl), ACC0 - MOVOU (16*1)(pTbl), ACCM - MOVOU ACC0, ACC1 - PSHUFD $78, X0, T1 - PXOR X0, T1 - PCLMULQDQ $0x00, X0, ACC0 - PCLMULQDQ $0x11, X0, ACC1 - PCLMULQDQ $0x00, T1, ACCM - - mulRoundAAD(X1, 1) - 
mulRoundAAD(X2, 2) - mulRoundAAD(X3, 3) - mulRoundAAD(X4, 4) - mulRoundAAD(X5, 5) - mulRoundAAD(X6, 6) - mulRoundAAD(X7, 7) - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - reduceRound(ACC0) - reduceRound(ACC0) - PXOR ACC1, ACC0 - JMP dataOctaLoop + CMPQ DX, $0x80 + JB startSinglesLoop + SUBQ $0x80, DX + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + LEAQ 128(SI), SI + PSHUFB X15, X0 + PSHUFB X15, X1 + PSHUFB X15, X2 + PSHUFB X15, X3 + PSHUFB X15, X4 + PSHUFB X15, X5 + PSHUFB X15, X6 + PSHUFB X15, X7 + PXOR X8, X0 + MOVOU (DI), X8 + MOVOU 16(DI), X10 + MOVOU X8, X9 + PSHUFD $0x4e, X0, X12 + PXOR X0, X12 + PCLMULQDQ $0x00, X0, X8 + PCLMULQDQ $0x11, X0, X9 + PCLMULQDQ $0x00, X12, X10 + MOVOU 32(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X1, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X1, X13 + PXOR X13, X9 + PSHUFD $0x4e, X1, X12 + PXOR X12, X1 + MOVOU 48(DI), X12 + PCLMULQDQ $0x00, X1, X12 + PXOR X12, X10 + MOVOU 64(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X2, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X2, X13 + PXOR X13, X9 + PSHUFD $0x4e, X2, X12 + PXOR X12, X2 + MOVOU 80(DI), X12 + PCLMULQDQ $0x00, X2, X12 + PXOR X12, X10 + MOVOU 96(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X3, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X3, X13 + PXOR X13, X9 + PSHUFD $0x4e, X3, X12 + PXOR X12, X3 + MOVOU 112(DI), X12 + PCLMULQDQ $0x00, X3, X12 + PXOR X12, X10 + MOVOU 128(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X4, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X4, X13 + PXOR X13, X9 + PSHUFD $0x4e, X4, X12 + PXOR X12, X4 + MOVOU 144(DI), X12 + PCLMULQDQ $0x00, X4, X12 + PXOR X12, X10 + MOVOU 160(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X5, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X5, X13 + PXOR X13, X9 + PSHUFD $0x4e, X5, X12 + PXOR X12, X5 + MOVOU 176(DI), X12 + PCLMULQDQ $0x00, X5, X12 + PXOR X12, X10 + MOVOU 192(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X6, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X6, X13 + PXOR X13, X9 + PSHUFD $0x4e, X6, X12 + PXOR X12, X6 + MOVOU 208(DI), X12 + PCLMULQDQ $0x00, X6, X12 + PXOR X12, X10 + MOVOU 224(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X7, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X7, X13 + PXOR X13, X9 + PSHUFD $0x4e, X7, X12 + PXOR X12, X7 + MOVOU 240(DI), X12 + PCLMULQDQ $0x00, X7, X12 + PXOR X12, X10 + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 + JMP dataOctaLoop startSinglesLoop: - MOVOU (16*14)(pTbl), T1 - MOVOU (16*15)(pTbl), T2 + MOVOU 224(DI), X12 + MOVOU 240(DI), X13 dataSinglesLoop: + CMPQ DX, $0x10 + JB dataEnd + SUBQ $0x10, DX + MOVOU (SI), X0 - CMPQ autLen, $16 - JB dataEnd - SUBQ $16, autLen - - MOVOU (aut), B0 dataMul: - PSHUFB BSWAP, B0 - PXOR ACC0, B0 - - MOVOU T1, ACC0 - MOVOU T2, ACCM - MOVOU T1, ACC1 - - PSHUFD $78, B0, T0 - PXOR B0, T0 - PCLMULQDQ $0x00, B0, ACC0 - PCLMULQDQ $0x11, B0, ACC1 - PCLMULQDQ $0x00, T0, ACCM - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - - MOVOU POLY, T0 - PCLMULQDQ $0x01, ACC0, T0 - PSHUFD $78, ACC0, ACC0 - PXOR T0, ACC0 - - MOVOU POLY, T0 - PCLMULQDQ $0x01, ACC0, T0 - PSHUFD $78, ACC0, ACC0 - PXOR T0, ACC0 - PXOR ACC1, ACC0 
- - LEAQ 16(aut), aut - - JMP dataSinglesLoop + PSHUFB X15, X0 + PXOR X8, X0 + MOVOU X12, X8 + MOVOU X13, X10 + MOVOU X12, X9 + PSHUFD $0x4e, X0, X11 + PXOR X0, X11 + PCLMULQDQ $0x00, X0, X8 + PCLMULQDQ $0x11, X0, X9 + PCLMULQDQ $0x00, X11, X10 + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 + LEAQ 16(SI), SI + JMP dataSinglesLoop dataEnd: - - TESTQ autLen, autLen - JEQ dataBail - - PXOR B0, B0 - LEAQ -1(aut)(autLen*1), aut + TESTQ DX, DX + JEQ dataBail + PXOR X0, X0 + LEAQ -1(SI)(DX*1), SI dataLoadLoop: - - PSLLDQ $1, B0 - PINSRB $0, (aut), B0 - - LEAQ -1(aut), aut - DECQ autLen - JNE dataLoadLoop - - JMP dataMul + PSLLDQ $0x01, X0 + PINSRB $0x00, (SI), X0 + LEAQ -1(SI), SI + DECQ DX + JNE dataLoadLoop + JMP dataMul dataBail: - MOVOU ACC0, (tPtr) + MOVOU X8, (CX) RET -#undef pTbl -#undef aut -#undef tPtr -#undef autLen - -// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32) -TEXT ·gcmAesEnc(SB),0,$256-96 -#define pTbl DI -#define ctx DX -#define ctrPtr CX -#define ptx SI -#define ks AX -#define tPtr R8 -#define ptxLen R9 -#define aluCTR R10 -#define aluTMP R11 -#define aluK R12 -#define NR R13 - -#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP) -#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7 -#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7 -#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7 -#define combinedRound(i) \ - MOVOU (16*i)(ks), T0;\ - AESENC T0, B0;\ - AESENC T0, B1;\ - AESENC T0, B2;\ - AESENC T0, B3;\ - MOVOU (16*(i*2))(pTbl), T1;\ - MOVOU T1, T2;\ - AESENC T0, B4;\ - AESENC T0, B5;\ - AESENC T0, B6;\ - AESENC T0, B7;\ - MOVOU (16*i)(SP), T0;\ - PCLMULQDQ $0x00, T0, T1;\ - PXOR T1, ACC0;\ - PSHUFD $78, T0, T1;\ - PCLMULQDQ $0x11, T0, T2;\ - PXOR T1, T0;\ - PXOR T2, ACC1;\ - MOVOU (16*(i*2+1))(pTbl), T2;\ - PCLMULQDQ $0x00, T2, T0;\ - PXOR T0, ACCM -#define mulRound(i) \ - MOVOU (16*i)(SP), T0;\ - MOVOU (16*(i*2))(pTbl), T1;\ - MOVOU T1, T2;\ - PCLMULQDQ $0x00, T0, T1;\ - PXOR T1, ACC0;\ - PCLMULQDQ $0x11, T0, T2;\ - PXOR T2, ACC1;\ - PSHUFD $78, T0, T1;\ - PXOR T1, T0;\ - MOVOU (16*(i*2+1))(pTbl), T1;\ - PCLMULQDQ $0x00, T0, T1;\ - PXOR T1, ACCM - - MOVQ productTable+0(FP), pTbl - MOVQ dst+8(FP), ctx - MOVQ src_base+32(FP), ptx - MOVQ src_len+40(FP), ptxLen - MOVQ ctr+56(FP), ctrPtr - MOVQ T+64(FP), tPtr - MOVQ ks_base+72(FP), ks - MOVQ ks_len+80(FP), NR - SHRQ $2, NR - DECQ NR - - MOVOU bswapMask<>(SB), BSWAP - MOVOU gcmPoly<>(SB), POLY - - MOVOU (tPtr), ACC0 - PXOR ACC1, ACC1 - PXOR ACCM, ACCM - MOVOU (ctrPtr), B0 - MOVL (3*4)(ctrPtr), aluCTR - MOVOU (ks), T0 - MOVL (3*4)(ks), aluK - BSWAPL aluCTR - BSWAPL aluK - - PXOR B0, T0 - MOVOU T0, (8*16 + 0*16)(SP) - increment(0) - - CMPQ ptxLen, $128 - JB gcmAesEncSingles - SUBQ $128, ptxLen +// func gcmAesEnc(productTable *[256]byte, dst []byte, src []byte, ctr *[16]byte, T *[16]byte, ks []uint32) +// Requires: AES, PCLMULQDQ, SSE2, SSE4.1, SSSE3 +TEXT ·gcmAesEnc(SB), 
$256-96 + MOVQ productTable+0(FP), DI + MOVQ dst_base+8(FP), DX + MOVQ src_base+32(FP), SI + MOVQ src_len+40(FP), R9 + MOVQ ctr+56(FP), CX + MOVQ T+64(FP), R8 + MOVQ ks_base+72(FP), AX + MOVQ ks_len+80(FP), R13 + SHRQ $0x02, R13 + DECQ R13 + MOVOU bswapMask<>+0(SB), X15 + MOVOU gcmPoly<>+0(SB), X14 + MOVOU (R8), X8 + PXOR X9, X9 + PXOR X10, X10 + MOVOU (CX), X0 + MOVL 12(CX), R10 + MOVOU (AX), X11 + MOVL 12(AX), R12 + BSWAPL R10 + BSWAPL R12 + PXOR X0, X11 + MOVOU X11, 128(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 140(SP) + CMPQ R9, $0x80 + JB gcmAesEncSingles + SUBQ $0x80, R9 // We have at least 8 blocks to encrypt, prepare the rest of the counters - MOVOU T0, (8*16 + 1*16)(SP) - increment(1) - MOVOU T0, (8*16 + 2*16)(SP) - increment(2) - MOVOU T0, (8*16 + 3*16)(SP) - increment(3) - MOVOU T0, (8*16 + 4*16)(SP) - increment(4) - MOVOU T0, (8*16 + 5*16)(SP) - increment(5) - MOVOU T0, (8*16 + 6*16)(SP) - increment(6) - MOVOU T0, (8*16 + 7*16)(SP) - increment(7) - - MOVOU (8*16 + 0*16)(SP), B0 - MOVOU (8*16 + 1*16)(SP), B1 - MOVOU (8*16 + 2*16)(SP), B2 - MOVOU (8*16 + 3*16)(SP), B3 - MOVOU (8*16 + 4*16)(SP), B4 - MOVOU (8*16 + 5*16)(SP), B5 - MOVOU (8*16 + 6*16)(SP), B6 - MOVOU (8*16 + 7*16)(SP), B7 + MOVOU X11, 144(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 156(SP) + MOVOU X11, 160(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 172(SP) + MOVOU X11, 176(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 188(SP) + MOVOU X11, 192(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 204(SP) + MOVOU X11, 208(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 220(SP) + MOVOU X11, 224(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 236(SP) + MOVOU X11, 240(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 252(SP) + MOVOU 128(SP), X0 + MOVOU 144(SP), X1 + MOVOU 160(SP), X2 + MOVOU 176(SP), X3 + MOVOU 192(SP), X4 + MOVOU 208(SP), X5 + MOVOU 224(SP), X6 + MOVOU 240(SP), X7 + MOVOU 16(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 140(SP) + MOVOU 32(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 156(SP) + MOVOU 48(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 172(SP) + MOVOU 64(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 188(SP) + MOVOU 80(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 204(SP) + MOVOU 96(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 
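The ADDL/MOVL/XORL/BSWAPL/MOVL groups woven between the AES rounds are the expansion of the increment(i) macro: the eight counter blocks live on the stack already XORed with AES round key 0, so advancing a counter only rewrites its final 4 bytes. A scalar sketch of that step, with illustrative names:

    import "encoding/binary"

    // stepCounter advances counter block i. ctr holds the block's
    // last 32-bit word byte-swapped to native order (aluCTR above),
    // k0 the matching word of round key 0 (aluK), so the stored
    // block stays pre-XORed with the first round key.
    func stepCounter(blocks *[8][16]byte, i int, ctr *uint32, k0 uint32) {
        *ctr++                                              // ADDL $0x01, R10
        binary.BigEndian.PutUint32(blocks[i][12:], *ctr^k0) // XORL; BSWAPL; MOVL
    }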
+ MOVL R11, 220(SP) + MOVOU 112(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 236(SP) + MOVOU 128(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 252(SP) + MOVOU 144(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 160(AX), X11 + CMPQ R13, $0x0c + JB encLast1 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 176(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 192(AX), X11 + JE encLast1 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 208(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 224(AX), X11 - aesRound(1) - increment(0) - aesRound(2) - increment(1) - aesRound(3) - increment(2) - aesRound(4) - increment(3) - aesRound(5) - increment(4) - aesRound(6) - increment(5) - aesRound(7) - increment(6) - aesRound(8) - increment(7) - aesRound(9) - MOVOU (16*10)(ks), T0 - CMPQ NR, $12 - JB encLast1 - aesRnd(T0) - aesRound(11) - MOVOU (16*12)(ks), T0 - JE encLast1 - aesRnd(T0) - aesRound(13) - MOVOU (16*14)(ks), T0 encLast1: - aesRndLast(T0) - - MOVOU (16*0)(ptx), T0 - PXOR T0, B0 - MOVOU (16*1)(ptx), T0 - PXOR T0, B1 - MOVOU (16*2)(ptx), T0 - PXOR T0, B2 - MOVOU (16*3)(ptx), T0 - PXOR T0, B3 - MOVOU (16*4)(ptx), T0 - PXOR T0, B4 - MOVOU (16*5)(ptx), T0 - PXOR T0, B5 - MOVOU (16*6)(ptx), T0 - PXOR T0, B6 - MOVOU (16*7)(ptx), T0 - PXOR T0, B7 - - MOVOU B0, (16*0)(ctx) - PSHUFB BSWAP, B0 - PXOR ACC0, B0 - MOVOU B1, (16*1)(ctx) - PSHUFB BSWAP, B1 - MOVOU B2, (16*2)(ctx) - PSHUFB BSWAP, B2 - MOVOU B3, (16*3)(ctx) - PSHUFB BSWAP, B3 - MOVOU B4, (16*4)(ctx) - PSHUFB BSWAP, B4 - MOVOU B5, (16*5)(ctx) - PSHUFB BSWAP, B5 - MOVOU B6, (16*6)(ctx) - PSHUFB BSWAP, B6 - MOVOU B7, (16*7)(ctx) - PSHUFB BSWAP, B7 - - MOVOU B0, (16*0)(SP) - MOVOU B1, (16*1)(SP) - MOVOU B2, (16*2)(SP) - MOVOU B3, (16*3)(SP) - MOVOU B4, (16*4)(SP) - MOVOU B5, (16*5)(SP) - MOVOU B6, (16*6)(SP) - MOVOU B7, (16*7)(SP) - - LEAQ 128(ptx), ptx - LEAQ 128(ctx), ctx + AESENCLAST X11, X0 + AESENCLAST X11, X1 + AESENCLAST X11, X2 + AESENCLAST X11, X3 + AESENCLAST X11, X4 + AESENCLAST X11, X5 + AESENCLAST X11, X6 + AESENCLAST X11, X7 + MOVOU (SI), X11 + PXOR X11, X0 + MOVOU 16(SI), X11 + PXOR X11, X1 + MOVOU 32(SI), X11 + PXOR X11, X2 + MOVOU 48(SI), X11 + PXOR X11, X3 + MOVOU 64(SI), X11 + PXOR X11, X4 + MOVOU 80(SI), X11 + PXOR X11, X5 + MOVOU 96(SI), X11 + PXOR X11, X6 + MOVOU 112(SI), X11 + PXOR X11, X7 + MOVOU X0, (DX) + PSHUFB X15, X0 + PXOR X8, X0 + MOVOU X1, 16(DX) + PSHUFB X15, X1 + MOVOU X2, 32(DX) + PSHUFB X15, X2 + MOVOU X3, 48(DX) + PSHUFB X15, X3 + MOVOU X4, 64(DX) + PSHUFB X15, X4 + MOVOU X5, 80(DX) + PSHUFB X15, X5 + MOVOU X6, 96(DX) + PSHUFB X15, X6 + MOVOU X7, 112(DX) + PSHUFB X15, X7 + MOVOU X0, (SP) + MOVOU X1, 16(SP) + MOVOU X2, 32(SP) + MOVOU X3, 48(SP) + MOVOU X4, 64(SP) + MOVOU X5, 80(SP) 
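After the last AES round the eight keystream blocks are XORed with plaintext, the ciphertext is written out, and byte-reversed copies are parked on the stack as the next batch of GHASH inputs. A scalar sketch of one such pass, illustrative only; the assembly additionally folds the running GHASH accumulator into block 0 (PXOR X8, X0), omitted here:

    import "crypto/cipher"

    // encryptOctet models one eight-block pass: encrypt the counter
    // blocks, XOR with plaintext, and keep the byte-reversed
    // ciphertext (the PSHUFB against bswapMask) for hashing.
    func encryptOctet(b cipher.Block, ctrs, pt, ct, gin *[8][16]byte) {
        for i := 0; i < 8; i++ {
            var ks [16]byte
            b.Encrypt(ks[:], ctrs[i][:])
            for j := 0; j < 16; j++ {
                ct[i][j] = pt[i][j] ^ ks[j]
                gin[i][15-j] = ct[i][j] // PSHUFB X15 reverses the block
            }
        }
    }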
+ MOVOU X6, 96(SP) + MOVOU X7, 112(SP) + LEAQ 128(SI), SI + LEAQ 128(DX), DX gcmAesEncOctetsLoop: + CMPQ R9, $0x80 + JB gcmAesEncOctetsEnd + SUBQ $0x80, R9 + MOVOU 128(SP), X0 + MOVOU 144(SP), X1 + MOVOU 160(SP), X2 + MOVOU 176(SP), X3 + MOVOU 192(SP), X4 + MOVOU 208(SP), X5 + MOVOU 224(SP), X6 + MOVOU 240(SP), X7 + MOVOU (SP), X11 + PSHUFD $0x4e, X11, X12 + PXOR X11, X12 + MOVOU (DI), X8 + MOVOU 16(DI), X10 + MOVOU X8, X9 + PCLMULQDQ $0x00, X12, X10 + PCLMULQDQ $0x00, X11, X8 + PCLMULQDQ $0x11, X11, X9 + MOVOU 16(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 32(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 16(SP), X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 48(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 140(SP) + MOVOU 32(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 64(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 32(SP), X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 80(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 156(SP) + MOVOU 48(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 96(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 48(SP), X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 112(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 172(SP) + MOVOU 64(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 128(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 64(SP), X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 144(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 188(SP) + MOVOU 80(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 160(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 80(SP), X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 176(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 204(SP) + MOVOU 96(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 192(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 96(SP), X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 208(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 220(SP) + MOVOU 112(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 224(DI), X12 + 
MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 112(SP), X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 240(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 236(SP) + MOVOU 128(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 252(SP) + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU 144(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 + MOVOU 160(AX), X11 + CMPQ R13, $0x0c + JB encLast2 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 176(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 192(AX), X11 + JE encLast2 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 208(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 224(AX), X11 - CMPQ ptxLen, $128 - JB gcmAesEncOctetsEnd - SUBQ $128, ptxLen - - MOVOU (8*16 + 0*16)(SP), B0 - MOVOU (8*16 + 1*16)(SP), B1 - MOVOU (8*16 + 2*16)(SP), B2 - MOVOU (8*16 + 3*16)(SP), B3 - MOVOU (8*16 + 4*16)(SP), B4 - MOVOU (8*16 + 5*16)(SP), B5 - MOVOU (8*16 + 6*16)(SP), B6 - MOVOU (8*16 + 7*16)(SP), B7 - - MOVOU (16*0)(SP), T0 - PSHUFD $78, T0, T1 - PXOR T0, T1 - - MOVOU (16*0)(pTbl), ACC0 - MOVOU (16*1)(pTbl), ACCM - MOVOU ACC0, ACC1 - - PCLMULQDQ $0x00, T1, ACCM - PCLMULQDQ $0x00, T0, ACC0 - PCLMULQDQ $0x11, T0, ACC1 - - combinedRound(1) - increment(0) - combinedRound(2) - increment(1) - combinedRound(3) - increment(2) - combinedRound(4) - increment(3) - combinedRound(5) - increment(4) - combinedRound(6) - increment(5) - combinedRound(7) - increment(6) - - aesRound(8) - increment(7) - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - - reduceRound(ACC0) - aesRound(9) - - reduceRound(ACC0) - PXOR ACC1, ACC0 - - MOVOU (16*10)(ks), T0 - CMPQ NR, $12 - JB encLast2 - aesRnd(T0) - aesRound(11) - MOVOU (16*12)(ks), T0 - JE encLast2 - aesRnd(T0) - aesRound(13) - MOVOU (16*14)(ks), T0 encLast2: - aesRndLast(T0) - - MOVOU (16*0)(ptx), T0 - PXOR T0, B0 - MOVOU (16*1)(ptx), T0 - PXOR T0, B1 - MOVOU (16*2)(ptx), T0 - PXOR T0, B2 - MOVOU (16*3)(ptx), T0 - PXOR T0, B3 - MOVOU (16*4)(ptx), T0 - PXOR T0, B4 - MOVOU (16*5)(ptx), T0 - PXOR T0, B5 - MOVOU (16*6)(ptx), T0 - PXOR T0, B6 - MOVOU (16*7)(ptx), T0 - PXOR T0, B7 - - MOVOU B0, (16*0)(ctx) - PSHUFB BSWAP, B0 - PXOR ACC0, B0 - MOVOU B1, (16*1)(ctx) - PSHUFB BSWAP, B1 - MOVOU B2, (16*2)(ctx) - PSHUFB BSWAP, B2 - MOVOU B3, (16*3)(ctx) - PSHUFB BSWAP, B3 - MOVOU B4, (16*4)(ctx) - PSHUFB BSWAP, B4 - MOVOU B5, (16*5)(ctx) - PSHUFB BSWAP, B5 - MOVOU 
B6, (16*6)(ctx) - PSHUFB BSWAP, B6 - MOVOU B7, (16*7)(ctx) - PSHUFB BSWAP, B7 - - MOVOU B0, (16*0)(SP) - MOVOU B1, (16*1)(SP) - MOVOU B2, (16*2)(SP) - MOVOU B3, (16*3)(SP) - MOVOU B4, (16*4)(SP) - MOVOU B5, (16*5)(SP) - MOVOU B6, (16*6)(SP) - MOVOU B7, (16*7)(SP) - - LEAQ 128(ptx), ptx - LEAQ 128(ctx), ctx - - JMP gcmAesEncOctetsLoop + AESENCLAST X11, X0 + AESENCLAST X11, X1 + AESENCLAST X11, X2 + AESENCLAST X11, X3 + AESENCLAST X11, X4 + AESENCLAST X11, X5 + AESENCLAST X11, X6 + AESENCLAST X11, X7 + MOVOU (SI), X11 + PXOR X11, X0 + MOVOU 16(SI), X11 + PXOR X11, X1 + MOVOU 32(SI), X11 + PXOR X11, X2 + MOVOU 48(SI), X11 + PXOR X11, X3 + MOVOU 64(SI), X11 + PXOR X11, X4 + MOVOU 80(SI), X11 + PXOR X11, X5 + MOVOU 96(SI), X11 + PXOR X11, X6 + MOVOU 112(SI), X11 + PXOR X11, X7 + MOVOU X0, (DX) + PSHUFB X15, X0 + PXOR X8, X0 + MOVOU X1, 16(DX) + PSHUFB X15, X1 + MOVOU X2, 32(DX) + PSHUFB X15, X2 + MOVOU X3, 48(DX) + PSHUFB X15, X3 + MOVOU X4, 64(DX) + PSHUFB X15, X4 + MOVOU X5, 80(DX) + PSHUFB X15, X5 + MOVOU X6, 96(DX) + PSHUFB X15, X6 + MOVOU X7, 112(DX) + PSHUFB X15, X7 + MOVOU X0, (SP) + MOVOU X1, 16(SP) + MOVOU X2, 32(SP) + MOVOU X3, 48(SP) + MOVOU X4, 64(SP) + MOVOU X5, 80(SP) + MOVOU X6, 96(SP) + MOVOU X7, 112(SP) + LEAQ 128(SI), SI + LEAQ 128(DX), DX + JMP gcmAesEncOctetsLoop gcmAesEncOctetsEnd: - - MOVOU (16*0)(SP), T0 - MOVOU (16*0)(pTbl), ACC0 - MOVOU (16*1)(pTbl), ACCM - MOVOU ACC0, ACC1 - PSHUFD $78, T0, T1 - PXOR T0, T1 - PCLMULQDQ $0x00, T0, ACC0 - PCLMULQDQ $0x11, T0, ACC1 - PCLMULQDQ $0x00, T1, ACCM - - mulRound(1) - mulRound(2) - mulRound(3) - mulRound(4) - mulRound(5) - mulRound(6) - mulRound(7) - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - - reduceRound(ACC0) - reduceRound(ACC0) - PXOR ACC1, ACC0 - - TESTQ ptxLen, ptxLen - JE gcmAesEncDone - - SUBQ $7, aluCTR + MOVOU (SP), X11 + MOVOU (DI), X8 + MOVOU 16(DI), X10 + MOVOU X8, X9 + PSHUFD $0x4e, X11, X12 + PXOR X11, X12 + PCLMULQDQ $0x00, X11, X8 + PCLMULQDQ $0x11, X11, X9 + PCLMULQDQ $0x00, X12, X10 + MOVOU 16(SP), X11 + MOVOU 32(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X11, X13 + PXOR X13, X9 + PSHUFD $0x4e, X11, X12 + PXOR X12, X11 + MOVOU 48(DI), X12 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X10 + MOVOU 32(SP), X11 + MOVOU 64(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X11, X13 + PXOR X13, X9 + PSHUFD $0x4e, X11, X12 + PXOR X12, X11 + MOVOU 80(DI), X12 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X10 + MOVOU 48(SP), X11 + MOVOU 96(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X11, X13 + PXOR X13, X9 + PSHUFD $0x4e, X11, X12 + PXOR X12, X11 + MOVOU 112(DI), X12 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X10 + MOVOU 64(SP), X11 + MOVOU 128(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X11, X13 + PXOR X13, X9 + PSHUFD $0x4e, X11, X12 + PXOR X12, X11 + MOVOU 144(DI), X12 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X10 + MOVOU 80(SP), X11 + MOVOU 160(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X11, X13 + PXOR X13, X9 + PSHUFD $0x4e, X11, X12 + PXOR X12, X11 + MOVOU 176(DI), X12 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X10 + MOVOU 96(SP), X11 + MOVOU 192(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X11, X13 + PXOR X13, X9 + PSHUFD $0x4e, X11, X12 + PXOR X12, X11 + MOVOU 208(DI), X12 + PCLMULQDQ $0x00, X11, X12 
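gcmAesEncOctetsEnd hashes the eight buffered blocks with the aggregated method: block 0 (which already carries the folded-in accumulator) is multiplied by H^8 from the product table, block 1 by H^7, and so on, with a single reduction for all eight products. In terms of the mulGF128 sketch above, which reduces per multiply, unlike the assembly, which keeps the 256-bit sums in X8/X9/X10 and reduces once at the end:

    // ghashOctet sketches the aggregated hash: hPow[k] holds H^(k+1)
    // as precomputed in the product table by gcmAesInit.
    func ghashOctet(hPow, blocks *[8][16]byte) (acc [16]byte) {
        for i := 0; i < 8; i++ {
            p := mulGF128(blocks[i], hPow[7-i]) // blocks[0]*H^8 ... blocks[7]*H^1
            for j := range acc {
                acc[j] ^= p[j]
            }
        }
        return acc
    }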
+ PXOR X12, X10 + MOVOU 112(SP), X11 + MOVOU 224(DI), X12 + MOVOU X12, X13 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PCLMULQDQ $0x11, X11, X13 + PXOR X13, X9 + PSHUFD $0x4e, X11, X12 + PXOR X12, X11 + MOVOU 240(DI), X12 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X10 + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 + TESTQ R9, R9 + JE gcmAesEncDone + SUBQ $0x07, R10 gcmAesEncSingles: - - MOVOU (16*1)(ks), B1 - MOVOU (16*2)(ks), B2 - MOVOU (16*3)(ks), B3 - MOVOU (16*4)(ks), B4 - MOVOU (16*5)(ks), B5 - MOVOU (16*6)(ks), B6 - MOVOU (16*7)(ks), B7 - - MOVOU (16*14)(pTbl), T2 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU 224(DI), X13 gcmAesEncSinglesLoop: + CMPQ R9, $0x10 + JB gcmAesEncTail + SUBQ $0x10, R9 + MOVOU 128(SP), X0 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 140(SP) + AESENC X1, X0 + AESENC X2, X0 + AESENC X3, X0 + AESENC X4, X0 + AESENC X5, X0 + AESENC X6, X0 + AESENC X7, X0 + MOVOU 128(AX), X11 + AESENC X11, X0 + MOVOU 144(AX), X11 + AESENC X11, X0 + MOVOU 160(AX), X11 + CMPQ R13, $0x0c + JB encLast3 + AESENC X11, X0 + MOVOU 176(AX), X11 + AESENC X11, X0 + MOVOU 192(AX), X11 + JE encLast3 + AESENC X11, X0 + MOVOU 208(AX), X11 + AESENC X11, X0 + MOVOU 224(AX), X11 - CMPQ ptxLen, $16 - JB gcmAesEncTail - SUBQ $16, ptxLen - - MOVOU (8*16 + 0*16)(SP), B0 - increment(0) - - AESENC B1, B0 - AESENC B2, B0 - AESENC B3, B0 - AESENC B4, B0 - AESENC B5, B0 - AESENC B6, B0 - AESENC B7, B0 - MOVOU (16*8)(ks), T0 - AESENC T0, B0 - MOVOU (16*9)(ks), T0 - AESENC T0, B0 - MOVOU (16*10)(ks), T0 - CMPQ NR, $12 - JB encLast3 - AESENC T0, B0 - MOVOU (16*11)(ks), T0 - AESENC T0, B0 - MOVOU (16*12)(ks), T0 - JE encLast3 - AESENC T0, B0 - MOVOU (16*13)(ks), T0 - AESENC T0, B0 - MOVOU (16*14)(ks), T0 encLast3: - AESENCLAST T0, B0 - - MOVOU (ptx), T0 - PXOR T0, B0 - MOVOU B0, (ctx) - - PSHUFB BSWAP, B0 - PXOR ACC0, B0 - - MOVOU T2, ACC0 - MOVOU T2, ACC1 - MOVOU (16*15)(pTbl), ACCM - - PSHUFD $78, B0, T0 - PXOR B0, T0 - PCLMULQDQ $0x00, B0, ACC0 - PCLMULQDQ $0x11, B0, ACC1 - PCLMULQDQ $0x00, T0, ACCM - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - - reduceRound(ACC0) - reduceRound(ACC0) - PXOR ACC1, ACC0 - - LEAQ (16*1)(ptx), ptx - LEAQ (16*1)(ctx), ctx - - JMP gcmAesEncSinglesLoop + AESENCLAST X11, X0 + MOVOU (SI), X11 + PXOR X11, X0 + MOVOU X0, (DX) + PSHUFB X15, X0 + PXOR X8, X0 + MOVOU X13, X8 + MOVOU X13, X9 + MOVOU 240(DI), X10 + PSHUFD $0x4e, X0, X11 + PXOR X0, X11 + PCLMULQDQ $0x00, X0, X8 + PCLMULQDQ $0x11, X0, X9 + PCLMULQDQ $0x00, X11, X10 + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 + LEAQ 16(SI), SI + LEAQ 16(DX), DX + JMP gcmAesEncSinglesLoop gcmAesEncTail: - TESTQ ptxLen, ptxLen - JE gcmAesEncDone + TESTQ R9, R9 + JE gcmAesEncDone + MOVOU 128(SP), X0 + AESENC X1, X0 + AESENC X2, X0 + AESENC X3, X0 + AESENC X4, X0 + AESENC X5, X0 + AESENC X6, X0 + AESENC X7, X0 + MOVOU 128(AX), X11 + AESENC X11, X0 + MOVOU 
144(AX), X11 + AESENC X11, X0 + MOVOU 160(AX), X11 + CMPQ R13, $0x0c + JB encLast4 + AESENC X11, X0 + MOVOU 176(AX), X11 + AESENC X11, X0 + MOVOU 192(AX), X11 + JE encLast4 + AESENC X11, X0 + MOVOU 208(AX), X11 + AESENC X11, X0 + MOVOU 224(AX), X11 - MOVOU (8*16 + 0*16)(SP), B0 - AESENC B1, B0 - AESENC B2, B0 - AESENC B3, B0 - AESENC B4, B0 - AESENC B5, B0 - AESENC B6, B0 - AESENC B7, B0 - MOVOU (16*8)(ks), T0 - AESENC T0, B0 - MOVOU (16*9)(ks), T0 - AESENC T0, B0 - MOVOU (16*10)(ks), T0 - CMPQ NR, $12 - JB encLast4 - AESENC T0, B0 - MOVOU (16*11)(ks), T0 - AESENC T0, B0 - MOVOU (16*12)(ks), T0 - JE encLast4 - AESENC T0, B0 - MOVOU (16*13)(ks), T0 - AESENC T0, B0 - MOVOU (16*14)(ks), T0 encLast4: - AESENCLAST T0, B0 - MOVOU B0, T0 - - LEAQ -1(ptx)(ptxLen*1), ptx + AESENCLAST X11, X0 + MOVOU X0, X11 + LEAQ -1(SI)(R9*1), SI + MOVQ R9, R11 + SHLQ $0x04, R11 + LEAQ andMask<>+0(SB), R10 + MOVOU -16(R10)(R11*1), X12 + PXOR X0, X0 - MOVQ ptxLen, aluTMP - SHLQ $4, aluTMP - - LEAQ andMask<>(SB), aluCTR - MOVOU -16(aluCTR)(aluTMP*1), T1 - - PXOR B0, B0 ptxLoadLoop: - PSLLDQ $1, B0 - PINSRB $0, (ptx), B0 - LEAQ -1(ptx), ptx - DECQ ptxLen - JNE ptxLoadLoop - - PXOR T0, B0 - PAND T1, B0 - MOVOU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT - - PSHUFB BSWAP, B0 - PXOR ACC0, B0 - - MOVOU T2, ACC0 - MOVOU T2, ACC1 - MOVOU (16*15)(pTbl), ACCM - - PSHUFD $78, B0, T0 - PXOR B0, T0 - PCLMULQDQ $0x00, B0, ACC0 - PCLMULQDQ $0x11, B0, ACC1 - PCLMULQDQ $0x00, T0, ACCM - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - - reduceRound(ACC0) - reduceRound(ACC0) - PXOR ACC1, ACC0 + PSLLDQ $0x01, X0 + PINSRB $0x00, (SI), X0 + LEAQ -1(SI), SI + DECQ R9 + JNE ptxLoadLoop + PXOR X11, X0 + PAND X12, X0 + MOVOU X0, (DX) + PSHUFB X15, X0 + PXOR X8, X0 + MOVOU X13, X8 + MOVOU X13, X9 + MOVOU 240(DI), X10 + PSHUFD $0x4e, X0, X11 + PXOR X0, X11 + PCLMULQDQ $0x00, X0, X8 + PCLMULQDQ $0x11, X0, X9 + PCLMULQDQ $0x00, X11, X10 + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 gcmAesEncDone: - MOVOU ACC0, (tPtr) + MOVOU X8, (R8) RET -#undef increment - -// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32) -TEXT ·gcmAesDec(SB),0,$128-96 -#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP) -#define combinedDecRound(i) \ - MOVOU (16*i)(ks), T0;\ - AESENC T0, B0;\ - AESENC T0, B1;\ - AESENC T0, B2;\ - AESENC T0, B3;\ - MOVOU (16*(i*2))(pTbl), T1;\ - MOVOU T1, T2;\ - AESENC T0, B4;\ - AESENC T0, B5;\ - AESENC T0, B6;\ - AESENC T0, B7;\ - MOVOU (16*i)(ctx), T0;\ - PSHUFB BSWAP, T0;\ - PCLMULQDQ $0x00, T0, T1;\ - PXOR T1, ACC0;\ - PSHUFD $78, T0, T1;\ - PCLMULQDQ $0x11, T0, T2;\ - PXOR T1, T0;\ - PXOR T2, ACC1;\ - MOVOU (16*(i*2+1))(pTbl), T2;\ - PCLMULQDQ $0x00, T2, T0;\ - PXOR T0, ACCM - - MOVQ productTable+0(FP), pTbl - MOVQ dst+8(FP), ptx - MOVQ src_base+32(FP), ctx - MOVQ src_len+40(FP), ptxLen - MOVQ ctr+56(FP), ctrPtr - MOVQ T+64(FP), tPtr - MOVQ ks_base+72(FP), ks - MOVQ ks_len+80(FP), NR - - SHRQ $2, NR - DECQ NR - - MOVOU bswapMask<>(SB), BSWAP - MOVOU gcmPoly<>(SB), POLY - - MOVOU (tPtr), ACC0 - PXOR ACC1, ACC1 - PXOR ACCM, ACCM - MOVOU (ctrPtr), B0 - MOVL (3*4)(ctrPtr), aluCTR 
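Both the encrypt tail above and the decrypt tail below pick their mask from the andMask<> table that follows: entry n-1 is a 16-byte mask whose first n bytes are 0xff, and MOVOU -16(R10)(R11*1) with R11 = n*16 selects it without a branch. The equivalent mask, sketched with an illustrative name:

    // tailMask builds the mask for an n-byte tail, 1 <= n <= 15,
    // matching andMask<> entry n-1. The encrypt tail ANDs it against
    // the partial block after XORing in the keystream, before the
    // full 16-byte store (safe because the tag follows the
    // ciphertext); the decrypt tail ANDs it against the over-read
    // ciphertext block before hashing.
    func tailMask(n int) (m [16]byte) {
        for i := 0; i < n; i++ {
            m[i] = 0xff
        }
        return m
    }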
- MOVOU (ks), T0 - MOVL (3*4)(ks), aluK - BSWAPL aluCTR - BSWAPL aluK - - PXOR B0, T0 - MOVOU T0, (0*16)(SP) - increment(0) - CMPQ ptxLen, $128 - JB gcmAesDecSingles - - MOVOU T0, (1*16)(SP) - increment(1) - MOVOU T0, (2*16)(SP) - increment(2) - MOVOU T0, (3*16)(SP) - increment(3) - MOVOU T0, (4*16)(SP) - increment(4) - MOVOU T0, (5*16)(SP) - increment(5) - MOVOU T0, (6*16)(SP) - increment(6) - MOVOU T0, (7*16)(SP) - increment(7) +DATA andMask<>+0(SB)/8, $0x00000000000000ff +DATA andMask<>+8(SB)/8, $0x0000000000000000 +DATA andMask<>+16(SB)/8, $0x000000000000ffff +DATA andMask<>+24(SB)/8, $0x0000000000000000 +DATA andMask<>+32(SB)/8, $0x0000000000ffffff +DATA andMask<>+40(SB)/8, $0x0000000000000000 +DATA andMask<>+48(SB)/8, $0x00000000ffffffff +DATA andMask<>+56(SB)/8, $0x0000000000000000 +DATA andMask<>+64(SB)/8, $0x000000ffffffffff +DATA andMask<>+72(SB)/8, $0x0000000000000000 +DATA andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA andMask<>+88(SB)/8, $0x0000000000000000 +DATA andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA andMask<>+104(SB)/8, $0x0000000000000000 +DATA andMask<>+112(SB)/8, $0xffffffffffffffff +DATA andMask<>+120(SB)/8, $0x0000000000000000 +DATA andMask<>+128(SB)/8, $0xffffffffffffffff +DATA andMask<>+136(SB)/8, $0x00000000000000ff +DATA andMask<>+144(SB)/8, $0xffffffffffffffff +DATA andMask<>+152(SB)/8, $0x000000000000ffff +DATA andMask<>+160(SB)/8, $0xffffffffffffffff +DATA andMask<>+168(SB)/8, $0x0000000000ffffff +DATA andMask<>+176(SB)/8, $0xffffffffffffffff +DATA andMask<>+184(SB)/8, $0x00000000ffffffff +DATA andMask<>+192(SB)/8, $0xffffffffffffffff +DATA andMask<>+200(SB)/8, $0x000000ffffffffff +DATA andMask<>+208(SB)/8, $0xffffffffffffffff +DATA andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA andMask<>+224(SB)/8, $0xffffffffffffffff +DATA andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL andMask<>(SB), RODATA|NOPTR, $240 + +// func gcmAesDec(productTable *[256]byte, dst []byte, src []byte, ctr *[16]byte, T *[16]byte, ks []uint32) +// Requires: AES, PCLMULQDQ, SSE2, SSE4.1, SSSE3 +TEXT ·gcmAesDec(SB), $128-96 + MOVQ productTable+0(FP), DI + MOVQ dst_base+8(FP), SI + MOVQ src_base+32(FP), DX + MOVQ src_len+40(FP), R9 + MOVQ ctr+56(FP), CX + MOVQ T+64(FP), R8 + MOVQ ks_base+72(FP), AX + MOVQ ks_len+80(FP), R13 + SHRQ $0x02, R13 + DECQ R13 + MOVOU bswapMask<>+0(SB), X15 + MOVOU gcmPoly<>+0(SB), X14 + MOVOU (R8), X8 + PXOR X9, X9 + PXOR X10, X10 + MOVOU (CX), X0 + MOVL 12(CX), R10 + MOVOU (AX), X11 + MOVL 12(AX), R12 + BSWAPL R10 + BSWAPL R12 + PXOR X0, X11 + MOVOU X11, (SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 12(SP) + CMPQ R9, $0x80 + JB gcmAesDecSingles + MOVOU X11, 16(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 28(SP) + MOVOU X11, 32(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 44(SP) + MOVOU X11, 48(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 60(SP) + MOVOU X11, 64(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 76(SP) + MOVOU X11, 80(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 92(SP) + MOVOU X11, 96(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 108(SP) + MOVOU X11, 112(SP) + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 124(SP) gcmAesDecOctetsLoop: + CMPQ R9, $0x80 + JB gcmAesDecEndOctets + SUBQ $0x80, R9 + MOVOU (SP), X0 + MOVOU 16(SP), X1 + MOVOU 32(SP), X2 + MOVOU 48(SP), X3 + MOVOU 64(SP), X4 + MOVOU 
80(SP), X5 + MOVOU 96(SP), X6 + MOVOU 112(SP), X7 + MOVOU (DX), X11 + PSHUFB X15, X11 + PXOR X8, X11 + PSHUFD $0x4e, X11, X12 + PXOR X11, X12 + MOVOU (DI), X8 + MOVOU 16(DI), X10 + MOVOU X8, X9 + PCLMULQDQ $0x00, X12, X10 + PCLMULQDQ $0x00, X11, X8 + PCLMULQDQ $0x11, X11, X9 + MOVOU 16(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 32(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 16(DX), X11 + PSHUFB X15, X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 48(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 12(SP) + MOVOU 32(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 64(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 32(DX), X11 + PSHUFB X15, X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 80(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 28(SP) + MOVOU 48(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 96(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 48(DX), X11 + PSHUFB X15, X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 112(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 44(SP) + MOVOU 64(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 128(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 64(DX), X11 + PSHUFB X15, X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 144(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 60(SP) + MOVOU 80(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 160(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 80(DX), X11 + PSHUFB X15, X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 176(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 76(SP) + MOVOU 96(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 192(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 96(DX), X11 + PSHUFB X15, X11 + PCLMULQDQ $0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 208(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 92(SP) + MOVOU 112(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + MOVOU 224(DI), X12 + MOVOU X12, X13 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 112(DX), X11 + PSHUFB X15, X11 + PCLMULQDQ 
$0x00, X11, X12 + PXOR X12, X8 + PSHUFD $0x4e, X11, X12 + PCLMULQDQ $0x11, X11, X13 + PXOR X12, X11 + PXOR X13, X9 + MOVOU 240(DI), X13 + PCLMULQDQ $0x00, X13, X11 + PXOR X11, X10 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 108(SP) + MOVOU 128(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 124(SP) + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU 144(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 + MOVOU 160(AX), X11 + CMPQ R13, $0x0c + JB decLast1 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 176(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 192(AX), X11 + JE decLast1 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 208(AX), X11 + AESENC X11, X0 + AESENC X11, X1 + AESENC X11, X2 + AESENC X11, X3 + AESENC X11, X4 + AESENC X11, X5 + AESENC X11, X6 + AESENC X11, X7 + MOVOU 224(AX), X11 - CMPQ ptxLen, $128 - JB gcmAesDecEndOctets - SUBQ $128, ptxLen - - MOVOU (0*16)(SP), B0 - MOVOU (1*16)(SP), B1 - MOVOU (2*16)(SP), B2 - MOVOU (3*16)(SP), B3 - MOVOU (4*16)(SP), B4 - MOVOU (5*16)(SP), B5 - MOVOU (6*16)(SP), B6 - MOVOU (7*16)(SP), B7 - - MOVOU (16*0)(ctx), T0 - PSHUFB BSWAP, T0 - PXOR ACC0, T0 - PSHUFD $78, T0, T1 - PXOR T0, T1 - - MOVOU (16*0)(pTbl), ACC0 - MOVOU (16*1)(pTbl), ACCM - MOVOU ACC0, ACC1 - - PCLMULQDQ $0x00, T1, ACCM - PCLMULQDQ $0x00, T0, ACC0 - PCLMULQDQ $0x11, T0, ACC1 - - combinedDecRound(1) - increment(0) - combinedDecRound(2) - increment(1) - combinedDecRound(3) - increment(2) - combinedDecRound(4) - increment(3) - combinedDecRound(5) - increment(4) - combinedDecRound(6) - increment(5) - combinedDecRound(7) - increment(6) - - aesRound(8) - increment(7) - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - - reduceRound(ACC0) - aesRound(9) - - reduceRound(ACC0) - PXOR ACC1, ACC0 - - MOVOU (16*10)(ks), T0 - CMPQ NR, $12 - JB decLast1 - aesRnd(T0) - aesRound(11) - MOVOU (16*12)(ks), T0 - JE decLast1 - aesRnd(T0) - aesRound(13) - MOVOU (16*14)(ks), T0 decLast1: - aesRndLast(T0) - - MOVOU (16*0)(ctx), T0 - PXOR T0, B0 - MOVOU (16*1)(ctx), T0 - PXOR T0, B1 - MOVOU (16*2)(ctx), T0 - PXOR T0, B2 - MOVOU (16*3)(ctx), T0 - PXOR T0, B3 - MOVOU (16*4)(ctx), T0 - PXOR T0, B4 - MOVOU (16*5)(ctx), T0 - PXOR T0, B5 - MOVOU (16*6)(ctx), T0 - PXOR T0, B6 - MOVOU (16*7)(ctx), T0 - PXOR T0, B7 - - MOVOU B0, (16*0)(ptx) - MOVOU B1, (16*1)(ptx) - MOVOU B2, (16*2)(ptx) - MOVOU B3, (16*3)(ptx) - MOVOU B4, (16*4)(ptx) - MOVOU B5, (16*5)(ptx) - MOVOU B6, (16*6)(ptx) - MOVOU B7, (16*7)(ptx) - - LEAQ 128(ptx), ptx - LEAQ 128(ctx), ctx - - JMP gcmAesDecOctetsLoop + AESENCLAST X11, X0 + AESENCLAST X11, X1 + AESENCLAST X11, X2 + AESENCLAST X11, X3 + AESENCLAST X11, X4 + AESENCLAST X11, X5 + 
AESENCLAST X11, X6 + AESENCLAST X11, X7 + MOVOU (DX), X11 + PXOR X11, X0 + MOVOU 16(DX), X11 + PXOR X11, X1 + MOVOU 32(DX), X11 + PXOR X11, X2 + MOVOU 48(DX), X11 + PXOR X11, X3 + MOVOU 64(DX), X11 + PXOR X11, X4 + MOVOU 80(DX), X11 + PXOR X11, X5 + MOVOU 96(DX), X11 + PXOR X11, X6 + MOVOU 112(DX), X11 + PXOR X11, X7 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + LEAQ 128(SI), SI + LEAQ 128(DX), DX + JMP gcmAesDecOctetsLoop gcmAesDecEndOctets: - - SUBQ $7, aluCTR + SUBQ $0x07, R10 gcmAesDecSingles: - - MOVOU (16*1)(ks), B1 - MOVOU (16*2)(ks), B2 - MOVOU (16*3)(ks), B3 - MOVOU (16*4)(ks), B4 - MOVOU (16*5)(ks), B5 - MOVOU (16*6)(ks), B6 - MOVOU (16*7)(ks), B7 - - MOVOU (16*14)(pTbl), T2 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU 224(DI), X13 gcmAesDecSinglesLoop: + CMPQ R9, $0x10 + JB gcmAesDecTail + SUBQ $0x10, R9 + MOVOU (DX), X0 + MOVOU X0, X12 + PSHUFB X15, X0 + PXOR X8, X0 + MOVOU X13, X8 + MOVOU X13, X9 + MOVOU 240(DI), X10 + PCLMULQDQ $0x00, X0, X8 + PCLMULQDQ $0x11, X0, X9 + PSHUFD $0x4e, X0, X11 + PXOR X0, X11 + PCLMULQDQ $0x00, X11, X10 + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 + MOVOU (SP), X0 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 12(SP) + AESENC X1, X0 + AESENC X2, X0 + AESENC X3, X0 + AESENC X4, X0 + AESENC X5, X0 + AESENC X6, X0 + AESENC X7, X0 + MOVOU 128(AX), X11 + AESENC X11, X0 + MOVOU 144(AX), X11 + AESENC X11, X0 + MOVOU 160(AX), X11 + CMPQ R13, $0x0c + JB decLast2 + AESENC X11, X0 + MOVOU 176(AX), X11 + AESENC X11, X0 + MOVOU 192(AX), X11 + JE decLast2 + AESENC X11, X0 + MOVOU 208(AX), X11 + AESENC X11, X0 + MOVOU 224(AX), X11 - CMPQ ptxLen, $16 - JB gcmAesDecTail - SUBQ $16, ptxLen - - MOVOU (ctx), B0 - MOVOU B0, T1 - PSHUFB BSWAP, B0 - PXOR ACC0, B0 - - MOVOU T2, ACC0 - MOVOU T2, ACC1 - MOVOU (16*15)(pTbl), ACCM - - PCLMULQDQ $0x00, B0, ACC0 - PCLMULQDQ $0x11, B0, ACC1 - PSHUFD $78, B0, T0 - PXOR B0, T0 - PCLMULQDQ $0x00, T0, ACCM - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - - reduceRound(ACC0) - reduceRound(ACC0) - PXOR ACC1, ACC0 - - MOVOU (0*16)(SP), B0 - increment(0) - AESENC B1, B0 - AESENC B2, B0 - AESENC B3, B0 - AESENC B4, B0 - AESENC B5, B0 - AESENC B6, B0 - AESENC B7, B0 - MOVOU (16*8)(ks), T0 - AESENC T0, B0 - MOVOU (16*9)(ks), T0 - AESENC T0, B0 - MOVOU (16*10)(ks), T0 - CMPQ NR, $12 - JB decLast2 - AESENC T0, B0 - MOVOU (16*11)(ks), T0 - AESENC T0, B0 - MOVOU (16*12)(ks), T0 - JE decLast2 - AESENC T0, B0 - MOVOU (16*13)(ks), T0 - AESENC T0, B0 - MOVOU (16*14)(ks), T0 decLast2: - AESENCLAST T0, B0 - - PXOR T1, B0 - MOVOU B0, (ptx) - - LEAQ (16*1)(ptx), ptx - LEAQ (16*1)(ctx), ctx - - JMP gcmAesDecSinglesLoop + AESENCLAST X11, X0 + PXOR X12, X0 + MOVOU X0, (SI) + LEAQ 16(SI), SI + LEAQ 16(DX), DX + JMP gcmAesDecSinglesLoop gcmAesDecTail: + TESTQ R9, R9 + JE gcmAesDecDone + MOVQ R9, R11 + SHLQ $0x04, R11 + LEAQ andMask<>+0(SB), R10 + MOVOU -16(R10)(R11*1), X12 + MOVOU (DX), X0 + PAND X12, X0 + MOVOU X0, X12 + PSHUFB X15, X0 + PXOR X8, X0 + MOVOU 224(DI), X8 + MOVOU 240(DI), X10 + MOVOU X8, X9 + 
PCLMULQDQ $0x00, X0, X8 + PCLMULQDQ $0x11, X0, X9 + PSHUFD $0x4e, X0, X11 + PXOR X0, X11 + PCLMULQDQ $0x00, X11, X10 + PXOR X8, X10 + PXOR X9, X10 + MOVOU X10, X11 + PSRLDQ $0x08, X10 + PSLLDQ $0x08, X11 + PXOR X10, X9 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + MOVOU X14, X11 + PCLMULQDQ $0x01, X8, X11 + PSHUFD $0x4e, X8, X8 + PXOR X11, X8 + PXOR X9, X8 + MOVOU (SP), X0 + ADDL $0x01, R10 + MOVL R10, R11 + XORL R12, R11 + BSWAPL R11 + MOVL R11, 12(SP) + AESENC X1, X0 + AESENC X2, X0 + AESENC X3, X0 + AESENC X4, X0 + AESENC X5, X0 + AESENC X6, X0 + AESENC X7, X0 + MOVOU 128(AX), X11 + AESENC X11, X0 + MOVOU 144(AX), X11 + AESENC X11, X0 + MOVOU 160(AX), X11 + CMPQ R13, $0x0c + JB decLast3 + AESENC X11, X0 + MOVOU 176(AX), X11 + AESENC X11, X0 + MOVOU 192(AX), X11 + JE decLast3 + AESENC X11, X0 + MOVOU 208(AX), X11 + AESENC X11, X0 + MOVOU 224(AX), X11 - TESTQ ptxLen, ptxLen - JE gcmAesDecDone - - MOVQ ptxLen, aluTMP - SHLQ $4, aluTMP - LEAQ andMask<>(SB), aluCTR - MOVOU -16(aluCTR)(aluTMP*1), T1 - - MOVOU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow - PAND T1, B0 - - MOVOU B0, T1 - PSHUFB BSWAP, B0 - PXOR ACC0, B0 - - MOVOU (16*14)(pTbl), ACC0 - MOVOU (16*15)(pTbl), ACCM - MOVOU ACC0, ACC1 - - PCLMULQDQ $0x00, B0, ACC0 - PCLMULQDQ $0x11, B0, ACC1 - PSHUFD $78, B0, T0 - PXOR B0, T0 - PCLMULQDQ $0x00, T0, ACCM - - PXOR ACC0, ACCM - PXOR ACC1, ACCM - MOVOU ACCM, T0 - PSRLDQ $8, ACCM - PSLLDQ $8, T0 - PXOR ACCM, ACC1 - PXOR T0, ACC0 - - reduceRound(ACC0) - reduceRound(ACC0) - PXOR ACC1, ACC0 - - MOVOU (0*16)(SP), B0 - increment(0) - AESENC B1, B0 - AESENC B2, B0 - AESENC B3, B0 - AESENC B4, B0 - AESENC B5, B0 - AESENC B6, B0 - AESENC B7, B0 - MOVOU (16*8)(ks), T0 - AESENC T0, B0 - MOVOU (16*9)(ks), T0 - AESENC T0, B0 - MOVOU (16*10)(ks), T0 - CMPQ NR, $12 - JB decLast3 - AESENC T0, B0 - MOVOU (16*11)(ks), T0 - AESENC T0, B0 - MOVOU (16*12)(ks), T0 - JE decLast3 - AESENC T0, B0 - MOVOU (16*13)(ks), T0 - AESENC T0, B0 - MOVOU (16*14)(ks), T0 decLast3: - AESENCLAST T0, B0 - PXOR T1, B0 + AESENCLAST X11, X0 + PXOR X12, X0 ptxStoreLoop: - PEXTRB $0, B0, (ptx) - PSRLDQ $1, B0 - LEAQ 1(ptx), ptx - DECQ ptxLen - - JNE ptxStoreLoop + PEXTRB $0x00, X0, (SI) + PSRLDQ $0x01, X0 + LEAQ 1(SI), SI + DECQ R9 + JNE ptxStoreLoop gcmAesDecDone: - - MOVOU ACC0, (tPtr) + MOVOU X8, (R8) RET diff --git a/src/go/types/stdlib_test.go b/src/go/types/stdlib_test.go index d41a3d10df..c98d67e114 100644 --- a/src/go/types/stdlib_test.go +++ b/src/go/types/stdlib_test.go @@ -357,6 +357,7 @@ var excluded = map[string]bool{ "builtin": true, // See go.dev/issue/46027: some imports are missing for this submodule. + "crypto/aes/_asm/gcm": true, "crypto/internal/bigmod/_asm": true, "crypto/internal/edwards25519/field/_asm": true, "crypto/md5/_asm": true,
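One more reference sketch, for the byte-at-a-time tails in the diff above: dataLoadLoop and ptxLoadLoop shift bytes in through the low lane with PSLLDQ/PINSRB while walking the buffer backwards, and ptxStoreLoop shifts them back out with PEXTRB/PSRLDQ, so a short tail never reads or writes past its buffer. Illustrative Go equivalents:

    // loadPartial models dataLoadLoop/ptxLoadLoop: build a block
    // from len(src) < 16 bytes without touching memory past src.
    func loadPartial(src []byte) (b [16]byte) {
        for i := len(src) - 1; i >= 0; i-- { // LEAQ -1(SI), SI
            copy(b[1:], b[:15]) // PSLLDQ $0x01
            b[0] = src[i]       // PINSRB $0x00, (SI), X0
        }
        return b
    }

    // storePartial models ptxStoreLoop: emit the first len(dst)
    // bytes of the block one at a time.
    func storePartial(dst []byte, b [16]byte) {
        for i := range dst {
            dst[i] = b[0]       // PEXTRB $0x00, X0, (SI)
            copy(b[:15], b[1:]) // PSRLDQ $0x01
            b[15] = 0
        }
    }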